Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Use preprocessed table dataset (WIP)
Browse files- app.py +3 -14
- src/envs.py +1 -2
- src/leaderboard/read_evals.py +0 -233
- src/populate.py +28 -7
app.py
CHANGED
|
@@ -37,7 +37,7 @@ from src.display.utils import (
|
|
| 37 |
VllmVersion,
|
| 38 |
fields,
|
| 39 |
)
|
| 40 |
-
from src.envs import API,
|
| 41 |
from src.i18n import (
|
| 42 |
CITATION_ACCORDION_LABEL,
|
| 43 |
CITATION_ACCORDION_LABEL_JA,
|
|
@@ -68,17 +68,6 @@ try:
|
|
| 68 |
)
|
| 69 |
except Exception:
|
| 70 |
restart_space()
|
| 71 |
-
try:
|
| 72 |
-
print(EVAL_RESULTS_PATH)
|
| 73 |
-
snapshot_download(
|
| 74 |
-
repo_id=RESULTS_REPO,
|
| 75 |
-
local_dir=EVAL_RESULTS_PATH,
|
| 76 |
-
repo_type="dataset",
|
| 77 |
-
tqdm_class=None,
|
| 78 |
-
etag_timeout=30,
|
| 79 |
-
)
|
| 80 |
-
except Exception:
|
| 81 |
-
restart_space()
|
| 82 |
|
| 83 |
|
| 84 |
# Get dataframes
|
|
@@ -90,7 +79,7 @@ except Exception:
|
|
| 90 |
FAILED_EVAL_QUEUE_DF,
|
| 91 |
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
| 92 |
|
| 93 |
-
ORIGINAL_DF = get_leaderboard_df(
|
| 94 |
MAX_MODEL_SIZE = ORIGINAL_DF["#Params (B)"].max()
|
| 95 |
|
| 96 |
|
|
@@ -316,7 +305,7 @@ def plot_size_vs_score(df_filtered: pd.DataFrame) -> go.Figure:
|
|
| 316 |
df = df[["model_name_for_query", "#Params (B)", "Few-shot"] + AVG_COLUMNS]
|
| 317 |
df[AVG_COLUMNS] = df[AVG_COLUMNS].astype(float)
|
| 318 |
df = df.rename(columns={"model_name_for_query": "Model", "Few-shot": "n-shot"})
|
| 319 |
-
df["model_name_without_org_name"] = df["Model"].str.split("/").str[-1] + " (" + df["n-shot"] + "-shot)"
|
| 320 |
df = pd.melt(
|
| 321 |
df,
|
| 322 |
id_vars=["Model", "model_name_without_org_name", "#Params (B)", "n-shot"],
|
|
|
|
| 37 |
VllmVersion,
|
| 38 |
fields,
|
| 39 |
)
|
| 40 |
+
from src.envs import API, CONTENTS_REPO, EVAL_REQUESTS_PATH, QUEUE_REPO, REPO_ID
|
| 41 |
from src.i18n import (
|
| 42 |
CITATION_ACCORDION_LABEL,
|
| 43 |
CITATION_ACCORDION_LABEL_JA,
|
|
|
|
| 68 |
)
|
| 69 |
except Exception:
|
| 70 |
restart_space()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
|
| 73 |
# Get dataframes
|
|
|
|
| 79 |
FAILED_EVAL_QUEUE_DF,
|
| 80 |
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
| 81 |
|
| 82 |
+
ORIGINAL_DF = get_leaderboard_df(CONTENTS_REPO, COLS, BENCHMARK_COLS)
|
| 83 |
MAX_MODEL_SIZE = ORIGINAL_DF["#Params (B)"].max()
|
| 84 |
|
| 85 |
|
|
|
|
| 305 |
df = df[["model_name_for_query", "#Params (B)", "Few-shot"] + AVG_COLUMNS]
|
| 306 |
df[AVG_COLUMNS] = df[AVG_COLUMNS].astype(float)
|
| 307 |
df = df.rename(columns={"model_name_for_query": "Model", "Few-shot": "n-shot"})
|
| 308 |
+
df["model_name_without_org_name"] = df["Model"].str.split("/").str[-1] + " (" + df["n-shot"].astype(str) + "-shot)"
|
| 309 |
df = pd.melt(
|
| 310 |
df,
|
| 311 |
id_vars=["Model", "model_name_without_org_name", "#Params (B)", "n-shot"],
|
src/envs.py
CHANGED
|
@@ -11,14 +11,13 @@ OWNER = "llm-jp" # Change to your org - don't forget to create a results and re
|
|
| 11 |
|
| 12 |
REPO_ID = f"{OWNER}/open-japanese-llm-leaderboard"
|
| 13 |
QUEUE_REPO = f"{OWNER}/leaderboard-requests"
|
| 14 |
-
|
| 15 |
|
| 16 |
# If you setup a cache later, just change HF_HOME
|
| 17 |
CACHE_PATH = os.getenv("HF_HOME", ".")
|
| 18 |
|
| 19 |
# Local caches
|
| 20 |
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
|
| 21 |
-
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
|
| 22 |
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
|
| 23 |
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
|
| 24 |
|
|
|
|
| 11 |
|
| 12 |
REPO_ID = f"{OWNER}/open-japanese-llm-leaderboard"
|
| 13 |
QUEUE_REPO = f"{OWNER}/leaderboard-requests"
|
| 14 |
+
CONTENTS_REPO = f"{OWNER}/leaderboard-contents"
|
| 15 |
|
| 16 |
# If you setup a cache later, just change HF_HOME
|
| 17 |
CACHE_PATH = os.getenv("HF_HOME", ".")
|
| 18 |
|
| 19 |
# Local caches
|
| 20 |
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
|
|
|
|
| 21 |
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
|
| 22 |
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
|
| 23 |
|
src/leaderboard/read_evals.py
DELETED
|
@@ -1,233 +0,0 @@
|
|
| 1 |
-
import glob
|
| 2 |
-
import json
|
| 3 |
-
import os
|
| 4 |
-
from dataclasses import dataclass
|
| 5 |
-
from decimal import Decimal
|
| 6 |
-
|
| 7 |
-
import dateutil
|
| 8 |
-
|
| 9 |
-
from src.display.formatting import make_clickable_model
|
| 10 |
-
from src.display.utils import AutoEvalColumn, Backend, ModelType, Tasks, Version, WeightType
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
@dataclass
class EvalResult:
    """One full evaluation run, built from a result file and enriched by its request file.

    Instances are normally created with :meth:`init_from_json_file` and then
    completed with :meth:`update_with_request_file`.
    """

    # Key used to merge several result files of the same run; built as
    # "{model}_{precision}_({num_few_shots}shots)_{add_special_tokens}".
    eval_name: str
    full_model: str  # "org/model" path on the hub (just "model" when there is no org)
    org: str  # None when the model name has no "org/" prefix
    model: str
    revision: str  # commit hash, "" if main
    results: dict  # maps each task metric name to its score (None when absent)
    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
    precision: str = "Unknown"
    weight_type: WeightType = WeightType.Original  # Original or Adapter
    architecture: str = "Unknown"
    license: str = "?"
    likes: int = 0
    num_params: int = 0
    date: str = ""  # submission date of request file
    num_few_shots: str = "0"
    add_special_tokens: str = ""
    llm_jp_eval_version: str = ""
    vllm_version: str = ""
    backend: str = ""

    @classmethod
    def init_from_json_file(cls, json_filepath):
        """Build an ``EvalResult`` from a single model result JSON file.

        Args:
            json_filepath: Path of the result file to read.

        Returns:
            A new instance populated from the file's ``config`` and ``scores``
            sections. Fields that only the request file provides keep their
            defaults.

        Raises:
            KeyError: If the file has no ``config`` or no ``scores`` section.
        """
        with open(json_filepath, encoding="utf-8") as fp:
            data = json.load(fp)

        config = data.get("config")
        if config is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise KeyError(f"'config' key not found in JSON file: {json_filepath}")
        if "scores" not in data:
            raise KeyError(f"'scores' key not found in JSON file: {json_filepath}")

        metainfo = config.get("metainfo", {})
        model_config = config.get("model", {})

        num_few_shots = str(metainfo.get("num_few_shots", 0))
        precision = model_config.get("dtype", "Unknown")
        add_special_tokens = str(
            config.get("pipeline_kwargs", {"add_special_tokens": "Unknown"}).get("add_special_tokens")
        )
        version = Version.from_str(metainfo.get("version", "?")).value.name
        vllm_version = metainfo.get("vllm-version", "")
        # Only the first dotted component of "_target_" is passed to Backend.from_str.
        backend = Backend.from_str(model_config.get("_target_", "?").split(".")[0]).value.name
        revision = model_config.get("revision", "")

        # The model name lives either at the top of the config or under "offline_inference".
        name = config.get("model_name", config.get("offline_inference", {}).get("model_name", "Unknown"))
        name_parts = name.split("/", 1)
        full_model = "/".join(name_parts)
        if len(name_parts) == 1:
            org = None
            model = name_parts[0]
        else:
            org, model = name_parts
        result_key = f"{model}_{precision}_({num_few_shots}shots)_{add_special_tokens}"

        scores = data["scores"]
        # Record every known task; a metric missing from the file is stored as None.
        results = {task.value.metric: scores.get(task.value.metric) for task in Tasks}

        return cls(
            eval_name=result_key,
            full_model=full_model,
            org=org,
            model=model,
            results=results,
            precision=precision,
            revision=revision,
            num_few_shots=num_few_shots,
            add_special_tokens=add_special_tokens,
            llm_jp_eval_version=version,
            vllm_version=vllm_version,
            backend=backend,
        )

    def update_with_request_file(self, requests_path):
        """Fill in request-only metadata from the matching request file.

        Best effort: if the request file cannot be located, read, or parsed,
        a message is printed and the fields not yet updated keep their values.

        Args:
            requests_path: Root directory holding the request files.
        """
        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision)
        try:
            with open(request_file, "r", encoding="utf-8") as f:
                request = json.load(f)
            self.model_type = ModelType.from_str(request.get("model_type", ""))
            self.weight_type = WeightType[request.get("weight_type", "Original")]
            self.license = request.get("license", "?")
            self.likes = request.get("likes", 0)
            self.num_params = request.get("params", 0)
            self.date = request.get("submitted_time", "")
            self.architecture = request.get("architecture", "?")
        except Exception:
            # Deliberately broad: a missing or malformed request file must not abort loading.
            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision}")

    def to_dict(self):
        """Convert the result to a dict compatible with the dataframe display.

        Returns:
            A dict keyed by display column names, plus ``"eval_name"``.

        Raises:
            KeyError: If the score of any task is missing, so callers can
                detect and skip incomplete evaluations.
        """
        data_dict = {
            "eval_name": self.eval_name,  # not a column, just a save name
            AutoEvalColumn.precision.name: self.precision,
            AutoEvalColumn.model_type.name: self.model_type.value.name,
            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
            AutoEvalColumn.architecture.name: self.architecture,
            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
            AutoEvalColumn.dummy.name: self.full_model,
            AutoEvalColumn.revision.name: self.revision,
            AutoEvalColumn.license.name: self.license,
            AutoEvalColumn.likes.name: self.likes,
            AutoEvalColumn.params.name: self.num_params,
            AutoEvalColumn.num_few_shots.name: self.num_few_shots,
            AutoEvalColumn.add_special_tokens.name: self.add_special_tokens,
            AutoEvalColumn.llm_jp_eval_version.name: self.llm_jp_eval_version,
            AutoEvalColumn.vllm_version.name: self.vllm_version,
            AutoEvalColumn.backend.name: self.backend,
        }

        for task in Tasks:
            task_value = task.value
            value = self.results.get(task_value.metric)
            if value is None:
                # Decimal(None) would raise TypeError; raise KeyError instead so an
                # incomplete evaluation is reported as a missing value.
                raise KeyError(f"Missing score for metric: {task_value.metric}")
            data_dict[task_value.col_name] = Decimal(value)

        return data_dict
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
def get_request_file_for_model(requests_path, model_name, precision):
    """Select the request file matching a model and precision.

    Only request files whose ``status`` is ``"FINISHED"`` are considered.
    Files that cannot be read, are not valid JSON, are not JSON objects, or
    lack the ``status``/``precision`` keys are skipped rather than aborting
    the lookup.

    Args:
        requests_path: Root directory holding the request files.
        model_name: Model identifier, used as a path prefix under ``requests_path``.
        precision: Precision of the run; only the part after the last ``"."``
            is compared, so ``"torch.bfloat16"`` matches ``"bfloat16"``.

    Returns:
        Path of the selected request file, or ``""`` when none matches.
    """
    pattern = os.path.join(
        requests_path,
        f"{model_name}_eval_request_*.json",
    )
    wanted_precision = precision.split(".")[-1]

    request_file = ""
    # Candidates are visited in reverse-sorted order and a later match overrides
    # an earlier one, so the match that sorts first by path is the one returned.
    for candidate in sorted(glob.glob(pattern), reverse=True):
        try:
            with open(candidate, "r", encoding="utf-8") as f:
                req_content = json.load(f)
        except (OSError, json.JSONDecodeError):
            continue
        if not isinstance(req_content, dict):
            continue
        if req_content.get("status") == "FINISHED" and req_content.get("precision") == wanted_precision:
            request_file = candidate
    return request_file
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
    """Collect every complete evaluation found under a results folder.

    Args:
        results_path: Root of the results folder; it is walked recursively.
        requests_path: Root of the requests folder, used to enrich each result.

    Returns:
        One ``EvalResult`` per distinct ``eval_name``. Result files sharing an
        ``eval_name`` are merged, and evaluations that cannot be converted with
        ``to_dict()`` because a score is missing are left out.
    """
    model_result_filepaths = []

    for root, _, files in os.walk(results_path):
        # Only folders that contain nothing but JSON files are model result folders.
        if not files or any(not f.endswith(".json") for f in files):
            continue

        # Order files by the name stripped of its "results_" prefix, ".json" suffix
        # and last 7 characters. This is plain string slicing and cannot raise a
        # date-parsing error, so no exception handling is needed around it.
        ordered = sorted(files, key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
        model_result_filepaths.extend(os.path.join(root, name) for name in ordered)

    eval_results = {}
    for model_result_filepath in model_result_filepaths:
        eval_result = EvalResult.init_from_json_file(model_result_filepath)
        eval_result.update_with_request_file(requests_path)

        # Store results of the same eval together; later files only contribute
        # the scores they actually have.
        eval_name = eval_result.eval_name
        if eval_name in eval_results:
            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
        else:
            eval_results[eval_name] = eval_result

    results = []
    for candidate in eval_results.values():
        try:
            candidate.to_dict()  # we test if the dict version is complete
        except (KeyError, TypeError):
            # Not all eval values present. A missing score surfaces as TypeError
            # when it is passed to Decimal, so both exception types mean "incomplete".
            continue
        results.append(candidate)

    return results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/populate.py
CHANGED
|
@@ -1,19 +1,40 @@
|
|
| 1 |
import json
|
| 2 |
import os
|
|
|
|
| 3 |
|
|
|
|
| 4 |
import pandas as pd
|
| 5 |
|
|
|
|
| 6 |
from src.display.formatting import has_no_nan_values, make_clickable_model
|
| 7 |
from src.display.utils import AutoEvalColumn, EvalQueueColumn
|
| 8 |
-
from src.leaderboard.read_evals import get_raw_eval_results
|
| 9 |
|
| 10 |
|
| 11 |
-
def get_leaderboard_df(
|
| 12 |
"""Creates a dataframe from all the individual experiment results"""
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
df =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
# Add a row ID column
|
| 19 |
df[AutoEvalColumn.row_id.name] = range(len(df))
|
|
@@ -32,7 +53,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
|
|
| 32 |
existing_score_cols = [col for col in score_cols if col in df.columns]
|
| 33 |
|
| 34 |
# スコア列を100で割り、.4f形式でフォーマット
|
| 35 |
-
df[existing_score_cols] = (df[existing_score_cols] / 100).
|
| 36 |
df = df.sort_values(by=[AutoEvalColumn.AVG.name], ascending=False)
|
| 37 |
df = df[cols].round(decimals=2)
|
| 38 |
|
|
|
|
| 1 |
import json
|
| 2 |
import os
|
| 3 |
+
from decimal import Decimal
|
| 4 |
|
| 5 |
+
import datasets
|
| 6 |
import pandas as pd
|
| 7 |
|
| 8 |
+
from src.about import Tasks
|
| 9 |
from src.display.formatting import has_no_nan_values, make_clickable_model
|
| 10 |
from src.display.utils import AutoEvalColumn, EvalQueueColumn
|
|
|
|
| 11 |
|
| 12 |
|
| 13 |
+
def get_leaderboard_df(contents_repo: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
|
| 14 |
"""Creates a dataframe from all the individual experiment results"""
|
| 15 |
+
df = datasets.load_dataset(contents_repo, split="train").to_pandas()
|
| 16 |
+
df["Model"] = df["model"].map(make_clickable_model)
|
| 17 |
+
df["T"] = df["model_type"].map(lambda x: x.split(":")[0].strip())
|
| 18 |
+
df["Type"] = df["model_type"].map(lambda x: x.split(":")[1].strip())
|
| 19 |
+
df["Backend Library"] = "vllm"
|
| 20 |
+
df = df.rename(columns={task.value.metric: task.value.col_name for task in Tasks})
|
| 21 |
+
df = df.rename(
|
| 22 |
+
columns={
|
| 23 |
+
"architecture": "Architecture",
|
| 24 |
+
"weight_type": "Weight type",
|
| 25 |
+
"precision": "Precision",
|
| 26 |
+
"license": "Hub License",
|
| 27 |
+
"params": "#Params (B)",
|
| 28 |
+
"likes": "Hub ❤️",
|
| 29 |
+
"revision": "Revision",
|
| 30 |
+
"num_few_shot": "Few-shot",
|
| 31 |
+
"add_special_tokens": "Add Special Tokens",
|
| 32 |
+
"llm_jp_eval_version": "llm-jp-eval version",
|
| 33 |
+
"vllm_version": "vllm version",
|
| 34 |
+
"model": "model_name_for_query",
|
| 35 |
+
}
|
| 36 |
+
)
|
| 37 |
+
df[[task.value.col_name for task in Tasks]] = df[[task.value.col_name for task in Tasks]].map(lambda x: Decimal(x))
|
| 38 |
|
| 39 |
# Add a row ID column
|
| 40 |
df[AutoEvalColumn.row_id.name] = range(len(df))
|
|
|
|
| 53 |
existing_score_cols = [col for col in score_cols if col in df.columns]
|
| 54 |
|
| 55 |
# スコア列を100で割り、.4f形式でフォーマット
|
| 56 |
+
df[existing_score_cols] = (df[existing_score_cols] / 100).map(lambda x: f"{x:.4f}")
|
| 57 |
df = df.sort_values(by=[AutoEvalColumn.AVG.name], ascending=False)
|
| 58 |
df = df[cols].round(decimals=2)
|
| 59 |
|