Merge branch 'main' into pr/15
- app.py +0 -1
- backend-cli.py +53 -20
- src/backend/envs.py +1 -1
- src/backend/hflm_with_measurement.py +5 -5
- src/backend/manage_requests.py +4 -5
- src/backend/run_eval_suite.py +1 -1
- src/display/utils.py +32 -3
- src/leaderboard/read_evals.py +11 -0
- src/populate.py +5 -7
- src/utils.py +101 -0
app.py CHANGED
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-
 import os
 import datetime
 import socket
backend-cli.py CHANGED
@@ -6,6 +6,7 @@ import argparse
 
 import socket
 import random
+import threading
 from datetime import datetime
 
 from src.backend.run_eval_suite import run_evaluation
@@ -16,7 +17,7 @@ from src.backend.manage_requests import EvalRequest
 from src.leaderboard.read_evals import EvalResult
 
 from src.envs import QUEUE_REPO, RESULTS_REPO, API
-from src.utils import my_snapshot_download
+from src.utils import my_snapshot_download, analyze_gpu_stats, parse_nvidia_smi, monitor_gpus
 
 from src.leaderboard.read_evals import get_raw_eval_results
 
@@ -123,7 +124,17 @@ def request_to_result_name(request: EvalRequest) -> str:
 
 
 def process_evaluation(task: Task, eval_request: EvalRequest, limit: Optional[int] = None) -> dict:
-    batch_size =
+    batch_size = 1
+    batch_size = eval_request.batch_size
+
+    init_gpu_info = analyze_gpu_stats(parse_nvidia_smi())
+    # if init_gpu_info['Mem(M)'] > 500:
+    #     assert False, f"This machine is not empty: {init_gpu_info}"
+    gpu_stats_list = []
+    stop_event = threading.Event()
+    monitor_thread = threading.Thread(target=monitor_gpus, args=(stop_event, 5, gpu_stats_list))
+    monitor_thread.start()
+
     try:
         results = run_evaluation(
             eval_request=eval_request,
@@ -150,6 +161,20 @@ def process_evaluation(task: Task, eval_request: EvalRequest, limit: Optional[in
         raise
 
     # print("RESULTS", results)
+    stop_event.set()
+    monitor_thread.join()
+    gpu_info = analyze_gpu_stats(gpu_stats_list)
+    for task_name in results['results'].keys():
+        for key, value in gpu_info.items():
+            if "GPU" not in key:
+                results['results'][task_name][f"{key},none"] = int(value)
+            else:
+                results['results'][task_name][f"{key},none"] = value
+
+        results['results'][task_name]['batch_size,none'] = batch_size
+        results['results'][task_name]['precision,none'] = eval_request.precision
+    print(f"gpu_stats_list: {gpu_stats_list}")
+    print("GPU Usage:", gpu_info)
 
     dumped = json.dumps(results, indent=2, default=lambda o: "<not serializable>")
     # print(dumped)
@@ -396,9 +421,9 @@ def get_args():
     parser = argparse.ArgumentParser(description="Run the backend")
     parser.add_argument("--debug", action="store_true", help="Run in debug mode")
     # debug parameters
-    parser.add_argument("--task", type=str, default="selfcheckgpt", help="Task to debug")
-    parser.add_argument("--model", type=str, default="
-    parser.add_argument("--precision", type=str, default="float16", help="Precision to debug")
+    parser.add_argument("--task", type=str, default="selfcheckgpt,mmlu", help="Task to debug")
+    parser.add_argument("--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1,mistralai/Mixtral-8x7B-v0.1", help="Model to debug")
+    parser.add_argument("--precision", type=str, default="float32,float16,8bit,4bit", help="Precision to debug")
     parser.add_argument("--inference-framework", type=str, default="hf-chat", help="Inference framework to debug")
     parser.add_argument("--limit", type=int, default=None, help="Limit for the number of samples")
     return parser.parse_args()
@@ -409,23 +434,31 @@ if __name__ == "__main__":
     local_debug = args.debug
     # debug specific task by ping
     if local_debug:
-        debug_model_names = [args.model]  # Use model from arguments
-        debug_task_name = args.task  # Use task from arguments
+        # debug_model_names = [args.model]  # Use model from arguments
+        # debug_task_name = [args.task]  # Use task from arguments
+        debug_model_names = args.model.split(",")
+        debug_task_name = args.task.split(",")
+        precisions = args.precision.split(",")
+        print(f"debug_model_names: {debug_model_names}, debug_task_name: {debug_task_name}, precisions: {precisions}")
         task_lst = TASKS_HARNESS.copy()
-        for
+        for precision in precisions:
             for debug_model_name in debug_model_names:
-
-
-
-
-
-
-
-
-
-
-
-
+                for task in task_lst:
+                    task_name = task.benchmark
+                    if task_name not in debug_task_name:
+                        continue
+                    try:
+                        eval_request = EvalRequest(
+                            model=debug_model_name,
+                            private=False,
+                            status="",
+                            json_filepath="",
+                            precision=precision,  # Use precision from arguments
+                            inference_framework=args.inference_framework  # Use inference framework from arguments
+                        )
+                        results = process_evaluation(task, eval_request, limit=args.limit)
+                    except Exception as e:
+                        print(f"debug running error: {e}")
    else:
        while True:
            res = False
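Note (illustrative, not part of the diff): a minimal, self-contained sketch of the stop-event / monitor-thread pattern that process_evaluation now uses. The stub poll() stands in for parse_nvidia_smi(); the 5-second interval mirrors the diff above.

import threading
import time

def poll():
    # Stub for parse_nvidia_smi(): returns one aggregated sample per call.
    return [{"Temp(C)": 0, "Power(W)": 0, "Mem(G)": 0.0, "Util(%)": 0, "GPU": "stub"}]

def monitor(stop_event, interval, stats_list):
    # Same shape as monitor_gpus(): keep sampling until the event is set.
    while not stop_event.is_set():
        stats_list.extend(poll())
        stop_event.wait(interval)

stats = []
stop_event = threading.Event()
thread = threading.Thread(target=monitor, args=(stop_event, 5, stats))
thread.start()
time.sleep(1)      # the evaluation itself would run here
stop_event.set()   # tell the monitor to stop sampling
thread.join()      # wait for the last sample to finish
print(stats)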
src/backend/envs.py CHANGED
@@ -63,4 +63,4 @@ EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
 EVAL_REQUESTS_PATH_BACKEND_SYNC = os.path.join(CACHE_PATH, "eval-queue-bk-sync")
 EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
 
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
src/backend/hflm_with_measurement.py CHANGED
@@ -57,12 +57,12 @@ class StopWatch(TextStreamer):
         self.start_decoding = time()
         self.decoding_iterations += 1
         return
-
+
     def end(self):
         if self.decoding_time is None and self.start_decoding is not None:
             self.decoding_time = time() - self.start_decoding
         return
-
+
 
 class HFLMWithMeasurement(HFLM):
     def __init__(self, **kwargs):
@@ -287,7 +287,7 @@ class HFLMWithMeasurement(HFLM):
         pbar.close()
 
         return re_ord.get_original(res)
-
+
     def _model_generate(self, context, max_length, stop, **generation_kwargs):
         # temperature = 0.0 if not set
         # if do_sample is false and temp==0.0:
@@ -318,7 +318,7 @@ class HFLMWithMeasurement(HFLM):
             **generation_kwargs,
         )
         end = time()
-
+
         batch_size = context.shape[0]
         output_length = stop_watch.decoding_iterations
 
@@ -403,7 +403,7 @@ class HFLMWithMeasurement(HFLM):
                    f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
                )
            # add EOS token to stop sequences
-            eos = self.tok_decode(self.eot_token_id
+            eos = self.tok_decode(self.eot_token_id)
            if not until:
                until = [eos]
            else:
src/backend/manage_requests.py CHANGED
@@ -27,24 +27,23 @@ class EvalRequest:
     likes: Optional[int] = 0
     params: Optional[int] = None
     license: Optional[str] = ""
+    batch_size: Optional[int] = 1
 
     def get_model_args(self) -> str:
         model_args = f"pretrained={self.model},revision={self.revision},parallelize=True" # ,max_length=4096"
-
+        model_args += ",trust_remote_code=True,device_map=auto"
         if self.precision in ["float16", "float32", "bfloat16"]:
             model_args += f",dtype={self.precision}"
         # Quantized models need some added config, the install of bits and bytes, etc
         # elif self.precision == "8bit":
         #     model_args += ",load_in_8bit=True"
-
-
+        elif self.precision == "4bit":
+            model_args += ",load_in_4bit=True"
         # elif self.precision == "GPTQ":
         # A GPTQ model does not need dtype to be specified,
         # it will be inferred from the config
-            pass
         elif self.precision == "8bit":
             model_args += ",load_in_8bit=True"
-            model_args += ",trust_remote_code=True"
         else:
             raise Exception(f"Unknown precision {self.precision}.")
         return model_args
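Note (illustrative, not part of the diff): the model_args string that the updated get_model_args() builds, traced by hand for a hypothetical 4bit request with revision "main".

# Mirrors the updated get_model_args() logic for one hypothetical input.
model, revision, precision = "mistralai/Mixtral-8x7B-v0.1", "main", "4bit"
model_args = f"pretrained={model},revision={revision},parallelize=True"
model_args += ",trust_remote_code=True,device_map=auto"
if precision in ["float16", "float32", "bfloat16"]:
    model_args += f",dtype={precision}"
elif precision == "4bit":
    model_args += ",load_in_4bit=True"
elif precision == "8bit":
    model_args += ",load_in_8bit=True"
print(model_args)
# pretrained=mistralai/Mixtral-8x7B-v0.1,revision=main,parallelize=True,trust_remote_code=True,device_map=auto,load_in_4bit=True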
src/backend/run_eval_suite.py CHANGED
@@ -13,7 +13,7 @@ orig_higher_is_better = ConfigurableTask.higher_is_better
 def process_results_decorator(func):
     def wrapper(self, doc, results, *args, **kwargs):
         processed_results = [r[0] for r in results]
-
+
         end_to_end_time = sum([r[1] for r in results]) / len(results)
         prefilling_time = sum([r[2] for r in results]) / len(results)
         decoding_throughput = sum([r[3] for r in results]) / len(results)
src/display/utils.py CHANGED
@@ -13,6 +13,29 @@ TS = "T/s" #Decoding throughput (tok/s)
 InFrame = "Method" #"Inference framework"
 MULTIPLE_CHOICEs = ["mmlu"]
 
+GPU_TEMP = 'Temp(C)'
+GPU_Power = 'Power(W)'
+GPU_Mem = 'Mem(G)'
+GPU_Name = "GPU"
+GPU_Util = 'Util(%)'
+BATCH_SIZE = 'bs'
+PRECISION = "Precision"
+system_metrics_to_name_map = {
+    "end_to_end_time": f"{E2Es}",
+    "prefilling_time": f"{PREs}",
+    "decoding_throughput": f"{TS}",
+}
+
+gpu_metrics_to_name_map = {
+    GPU_Util: GPU_Util,
+    GPU_TEMP: GPU_TEMP,
+    GPU_Power: GPU_Power,
+    GPU_Mem: GPU_Mem,
+    "batch_size": BATCH_SIZE,
+    "precision": PRECISION,
+    GPU_Name: GPU_Name,
+}
+
 @dataclass
 class Task:
     benchmark: str
@@ -81,11 +104,17 @@ auto_eval_column_dict.append(["inference_framework", ColumnContent, ColumnConten
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
     # System performance metrics
-    auto_eval_column_dict.append([f"{task.name}_end_to_end_time", ColumnContent, ColumnContent(f"{task.value.col_name}
+    auto_eval_column_dict.append([f"{task.name}_end_to_end_time", ColumnContent, ColumnContent(f"{task.value.col_name} {E2Es}", "number", True)])
+    auto_eval_column_dict.append([f"{task.name}_batch_size", ColumnContent, ColumnContent(f"{task.value.col_name} {BATCH_SIZE}", "number", True)])
+    # auto_eval_column_dict.append([f"{task.name}_precision", ColumnContent, ColumnContent(f"{task.value.col_name} {PRECISION}", "str", True)])
+    auto_eval_column_dict.append([f"{task.name}_gpu_mem", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Mem}", "number", True)])
+    auto_eval_column_dict.append([f"{task.name}_gpu", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Name}", "str", True)])
+    auto_eval_column_dict.append([f"{task.name}_gpu_util", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Util}", "number", True)])
     if task.value.benchmark in MULTIPLE_CHOICEs:
         continue
-    auto_eval_column_dict.append([f"{task.name}_prefilling_time", ColumnContent, ColumnContent(f"{task.value.col_name}
-    auto_eval_column_dict.append([f"{task.name}_decoding_throughput", ColumnContent, ColumnContent(f"{task.value.col_name}
+    # auto_eval_column_dict.append([f"{task.name}_prefilling_time", ColumnContent, ColumnContent(f"{task.value.col_name} {PREs}", "number", False)])
+    auto_eval_column_dict.append([f"{task.name}_decoding_throughput", ColumnContent, ColumnContent(f"{task.value.col_name} {TS}", "number", True)])
+
 
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
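Note (illustrative, not part of the diff): the display names these new columns get for a hypothetical task whose col_name is "MMLU". The constant values are the ones defined in the hunk above; E2Es is defined earlier in the real file, so its value here is assumed.

col_name = "MMLU"
GPU_Mem, GPU_Name, GPU_Util, BATCH_SIZE, TS = 'Mem(G)', "GPU", 'Util(%)', 'bs', "T/s"
E2Es = "E2E(s)"  # assumed abbreviation; defined earlier in the real file
for suffix in (E2Es, BATCH_SIZE, GPU_Mem, GPU_Name, GPU_Util, TS):
    print(f"{col_name} {suffix}")
# MMLU E2E(s), MMLU bs, MMLU Mem(G), MMLU GPU, MMLU Util(%), MMLU T/s
# (the T/s column is skipped for tasks listed in MULTIPLE_CHOICEs, e.g. "mmlu")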
src/leaderboard/read_evals.py CHANGED
@@ -103,6 +103,13 @@ class EvalResult:
 
         if to_add is True:
             multiplier = 100.0
+            if "GPU" in metric:
+                results[benchmark][metric] = value
+                continue
+            if "precision" in metric:
+                results[benchmark][metric] = value
+                continue
+
             if "rouge" in metric and "truthful" not in benchmark:
                 multiplier = 1.0
             if "squad" in benchmark:
@@ -111,6 +118,10 @@ class EvalResult:
                 multiplier = 1.0
             if "throughput" in metric:
                 multiplier = 1.0
+            if "batch_" in metric or "Mem" in metric or "Util" in metric:
+                multiplier = 1
+
+
             # print('RESULTS', data['results'])
             # print('XXX', benchmark, metric, value, multiplier)
             results[benchmark][metric] = value * multiplier
src/populate.py CHANGED
@@ -12,7 +12,7 @@ from src.leaderboard.read_evals import get_raw_eval_results, EvalResult, update_
 
 from src.backend.envs import Tasks as BackendTasks
 from src.display.utils import Tasks
-from src.display.utils import
+from src.display.utils import system_metrics_to_name_map, gpu_metrics_to_name_map
 
 def get_leaderboard_df(
     results_path: str,
@@ -45,12 +45,7 @@ def get_leaderboard_df(
         bm = (task.benchmark, task.metric)
         name_to_bm_map[name] = bm
 
-
-    system_metrics_to_name_map = {
-        "end_to_end_time": f"{E2Es}",
-        "prefilling_time": f"{PREs}",
-        "decoding_throughput": f"{TS}",
-    }
+
 
     all_data_json = []
     for entry in all_data_json_:
@@ -63,6 +58,9 @@ def get_leaderboard_df(
             if sys_metric in entry[k]:
                 new_entry[f"{k} {metric_namne}"] = entry[k][sys_metric]
 
+            for gpu_metric, metric_namne in gpu_metrics_to_name_map.items():
+                if gpu_metric in entry[k]:
+                    new_entry[f"{k} {metric_namne}"] = entry[k][gpu_metric]
         all_data_json += [new_entry]
 
     # all_data_json.append(baseline_row)
src/utils.py CHANGED
@@ -1,6 +1,14 @@
 import pandas as pd
 from huggingface_hub import snapshot_download
+import subprocess
+import re
+import os
 
+try:
+    from src.display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name
+except:
+    print("local debug: from display.utils")
+    from display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name
 
 def my_snapshot_download(repo_id, revision, local_dir, repo_type, max_workers):
     for i in range(10):
@@ -32,3 +40,96 @@ def get_dataset_summary_table(file_path):
     df = df[["Category", "Benchmark", "Data Split", "Data Size", "Language"]]
 
     return df
+
+def parse_nvidia_smi():
+    visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None)
+    if visible_devices is not None:
+        gpu_indices = visible_devices.split(',')
+    else:
+        # Query all GPU indices if CUDA_VISIBLE_DEVICES is not set
+        result = subprocess.run(['nvidia-smi', '--query-gpu=index', '--format=csv,noheader'], capture_output=True, text=True)
+        if result.returncode != 0:
+            print("Failed to query GPU indices.")
+            return []
+        gpu_indices = result.stdout.strip().split('\n')
+    print(f"gpu_indices: {gpu_indices}")
+    gpu_stats = []
+
+    gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%')
+    gpu_name_pattern = re.compile(r'NVIDIA\s+([\w\s]+?\d+GB)')
+
+    gpu_name = ""
+    for index in gpu_indices:
+        result = subprocess.run(['nvidia-smi', '-i', index], capture_output=True, text=True)
+        output = result.stdout.strip()
+        lines = output.split("\n")
+        for line in lines:
+            match = gpu_info_pattern.search(line)
+            name_match = gpu_name_pattern.search(line)
+            gpu_info = {}
+            if name_match:
+                gpu_name = name_match.group(1).strip()
+            if match:
+                temp, power_usage, mem_usage, gpu_util = map(int, match.groups())
+                gpu_info.update({
+                    GPU_TEMP: temp,
+                    GPU_Power: power_usage,
+                    GPU_Mem: round(mem_usage / 1024, 2),
+                    GPU_Util: gpu_util
+                })
+
+            if len(gpu_info) >= 4:
+                gpu_stats.append(gpu_info)
+    print(f"gpu_stats: {gpu_stats}")
+    gpu_name = f"{len(gpu_stats)}x{gpu_name}"
+    gpu_stats_total = {
+        GPU_TEMP: 0,
+        GPU_Power: 0,
+        GPU_Mem: 0,
+        GPU_Util: 0,
+        GPU_Name: gpu_name
+    }
+    for gpu_stat in gpu_stats:
+        gpu_stats_total[GPU_TEMP] += gpu_stat[GPU_TEMP]
+        gpu_stats_total[GPU_Power] += gpu_stat[GPU_Power]
+        gpu_stats_total[GPU_Mem] += gpu_stat[GPU_Mem]
+        gpu_stats_total[GPU_Util] += gpu_stat[GPU_Util]
+    gpu_stats_total[GPU_Mem] = gpu_stats_total[GPU_Mem] # G
+    gpu_stats_total[GPU_TEMP] /= len(gpu_stats)
+    gpu_stats_total[GPU_Power] /= len(gpu_stats)
+    gpu_stats_total[GPU_Util] /= len(gpu_stats)
+    return [gpu_stats_total]
+
+def monitor_gpus(stop_event, interval, stats_list):
+    while not stop_event.is_set():
+        gpu_stats = parse_nvidia_smi()
+        if gpu_stats:
+            stats_list.extend(gpu_stats)
+        stop_event.wait(interval)
+
+def analyze_gpu_stats(stats_list):
+    # Check if the stats_list is empty, and return None if it is
+    if not stats_list:
+        return None
+
+    # Initialize dictionaries to store the stats
+    avg_stats = {}
+    max_stats = {}
+
+    # Calculate average stats, excluding 'GPU_Mem'
+    for key in stats_list[0].keys():
+        if key != GPU_Mem and key != GPU_Name:
+            total = sum(d[key] for d in stats_list)
+            avg_stats[key] = total / len(stats_list)
+
+    # Calculate max stats for 'GPU_Mem'
+    max_stats[GPU_Mem] = max(d[GPU_Mem] for d in stats_list)
+    if GPU_Name in stats_list[0]:
+        avg_stats[GPU_Name] = stats_list[0][GPU_Name]
+    # Update average stats with max GPU memory usage
+    avg_stats.update(max_stats)
+
+    return avg_stats
+
+if __name__ == "__main__":
+    print(analyze_gpu_stats(parse_nvidia_smi()))
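Note (illustrative, not part of the diff): the shape of the data flowing from parse_nvidia_smi() through monitor_gpus() into analyze_gpu_stats(). The sample values below are made up; the aggregation (mean for Temp/Power/Util, max for Mem(G), GPU name carried through) follows the code above.

# Two hypothetical samples as monitor_gpus() would append them to stats_list.
samples = [
    {"Temp(C)": 60, "Power(W)": 250, "Mem(G)": 30.5, "Util(%)": 80, "GPU": "2xA100-SXM4-80GB"},
    {"Temp(C)": 70, "Power(W)": 300, "Mem(G)": 40.0, "Util(%)": 90, "GPU": "2xA100-SXM4-80GB"},
]
# analyze_gpu_stats(samples) would then return approximately:
# {"Temp(C)": 65.0, "Power(W)": 275.0, "Util(%)": 85.0, "GPU": "2xA100-SXM4-80GB", "Mem(G)": 40.0}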