Spaces:

llm-jp
/

open-japanese-llm-leaderboard

Running on CPU Upgrade

App Files Files Community

hysts HF Staff committed on Nov 6, 2024

Commit

99a4ea0

1 Parent(s): 560790f

Use preprocessed table dataset (WIP)

Browse files

Files changed (4) hide show

app.py +3 -14
src/envs.py +1 -2
src/leaderboard/read_evals.py +0 -233
src/populate.py +28 -7

app.py CHANGED Viewed

@@ -37,7 +37,7 @@ from src.display.utils import (
     VllmVersion,
     fields,
 )
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO
 from src.i18n import (
     CITATION_ACCORDION_LABEL,
     CITATION_ACCORDION_LABEL_JA,
@@ -68,17 +68,6 @@ try:
     )
 except Exception:
     restart_space()
-try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO,
-        local_dir=EVAL_RESULTS_PATH,
-        repo_type="dataset",
-        tqdm_class=None,
-        etag_timeout=30,
-    )
-except Exception:
-    restart_space()
 # Get dataframes
@@ -90,7 +79,7 @@ except Exception:
     FAILED_EVAL_QUEUE_DF,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-ORIGINAL_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 MAX_MODEL_SIZE = ORIGINAL_DF["#Params (B)"].max()
@@ -316,7 +305,7 @@ def plot_size_vs_score(df_filtered: pd.DataFrame) -> go.Figure:
     df = df[["model_name_for_query", "#Params (B)", "Few-shot"] + AVG_COLUMNS]
     df[AVG_COLUMNS] = df[AVG_COLUMNS].astype(float)
     df = df.rename(columns={"model_name_for_query": "Model", "Few-shot": "n-shot"})
-    df["model_name_without_org_name"] = df["Model"].str.split("/").str[-1] + " (" + df["n-shot"] + "-shot)"
     df = pd.melt(
         df,
         id_vars=["Model", "model_name_without_org_name", "#Params (B)", "n-shot"],

     VllmVersion,
     fields,
 )
+from src.envs import API, CONTENTS_REPO, EVAL_REQUESTS_PATH, QUEUE_REPO, REPO_ID
 from src.i18n import (
     CITATION_ACCORDION_LABEL,
     CITATION_ACCORDION_LABEL_JA,
     )
 except Exception:
     restart_space()
 # Get dataframes
     FAILED_EVAL_QUEUE_DF,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+ORIGINAL_DF = get_leaderboard_df(CONTENTS_REPO, COLS, BENCHMARK_COLS)
 MAX_MODEL_SIZE = ORIGINAL_DF["#Params (B)"].max()
     df = df[["model_name_for_query", "#Params (B)", "Few-shot"] + AVG_COLUMNS]
     df[AVG_COLUMNS] = df[AVG_COLUMNS].astype(float)
     df = df.rename(columns={"model_name_for_query": "Model", "Few-shot": "n-shot"})
+    df["model_name_without_org_name"] = df["Model"].str.split("/").str[-1] + " (" + df["n-shot"].astype(str) + "-shot)"
     df = pd.melt(
         df,
         id_vars=["Model", "model_name_without_org_name", "#Params (B)", "n-shot"],

src/envs.py CHANGED Viewed

@@ -11,14 +11,13 @@ OWNER = "llm-jp"  # Change to your org - don't forget to create a results and re
 REPO_ID = f"{OWNER}/open-japanese-llm-leaderboard"
 QUEUE_REPO = f"{OWNER}/leaderboard-requests"
-RESULTS_REPO = f"{OWNER}/leaderboard-contents"
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH = os.getenv("HF_HOME", ".")
 # Local caches
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
-EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
 EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
 EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

 REPO_ID = f"{OWNER}/open-japanese-llm-leaderboard"
 QUEUE_REPO = f"{OWNER}/leaderboard-requests"
+CONTENTS_REPO = f"{OWNER}/leaderboard-contents"
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH = os.getenv("HF_HOME", ".")
 # Local caches
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
 EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
 EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

src/leaderboard/read_evals.py DELETED Viewed

@@ -1,233 +0,0 @@
-import glob
-import json
-import os
-from dataclasses import dataclass
-from decimal import Decimal
-import dateutil
-from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, Backend, ModelType, Tasks, Version, WeightType
-@dataclass
-class EvalResult:
-    """Represents one full evaluation. Built from a combination of the result and request file for a given run."""
-    eval_name: str  # org_model_precision (uid)
-    full_model: str  # org/model (path on hub)
-    org: str
-    model: str
-    revision: str  # commit hash, "" if main
-    results: dict
-    # precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
-    precision: str = "Unknown"
-    # model_type: str = "Unknown"
-    weight_type: WeightType = WeightType.Original  # Original or Adapter
-    architecture: str = "Unknown"
-    license: str = "?"
-    likes: int = 0
-    num_params: int = 0
-    date: str = ""  # submission date of request file
-    num_few_shots: str = "0"
-    add_special_tokens: str = ""
-    llm_jp_eval_version: str = ""
-    vllm_version: str = ""
-    backend: str = ""
-    @classmethod
-    def init_from_json_file(self, json_filepath):
-        """Inits the result from the specific model result file"""
-        with open(json_filepath) as fp:
-            data = json.load(fp)
-        config = data.get("config")
-        metainfo = config.get("metainfo", {})
-        model_config = config.get("model", {})
-        # Get model type from metainfo
-        # model_type_str = metainfo.get("model_type", "")
-        # model_type = ModelType.from_str(model_type_str)
-        # model_type = metainfo.get("model_type", "Unknown")
-        # Get num_few_shots from metainfo
-        num_few_shots = str(metainfo.get("num_few_shots", 0))
-        # Precision
-        # precision = Precision.from_str(config.get("dtype"))
-        precision = model_config.get("dtype", "Unknown")
-        # Add Special Tokens
-        add_special_tokens = str(
-            config.get("pipeline_kwargs", {"add_special_tokens": "Unknown"}).get("add_special_tokens")
-        )
-        version = Version.from_str(metainfo.get("version", "?")).value.name
-        # Get vllm version from metainfo
-        vllm_version = metainfo.get("vllm-version", "")
-        backend = Backend.from_str(model_config.get("_target_", "?").split(".")[0]).value.name
-        revision = model_config.get("revision", "")
-        # Get model and org
-        # org_and_model = config.get("model_name", config.get("offline_inference").get("model_name", None))
-        org_and_model = config.get("model_name", config.get("offline_inference", {}).get("model_name", "Unknown"))
-        org_and_model = org_and_model.split("/", 1)
-        # org_and_modelがリストの場合、"/"で結合
-        if isinstance(org_and_model, list):
-            full_model = "/".join(org_and_model)
-        else:
-            full_model = org_and_model
-        if len(org_and_model) == 1:
-            org = None
-            model = org_and_model[0]
-            # result_key = f"{model}_{precision.value.name}"
-            result_key = f"{model}_{precision}_({num_few_shots}shots)_{add_special_tokens}"
-        else:
-            org = org_and_model[0]
-            model = org_and_model[1]
-            # result_key = f"{org}_{model}_{precision.value.name}"
-            result_key = f"{model}_{precision}_({num_few_shots}shots)_{add_special_tokens}"
-        full_model = "/".join(org_and_model)
-        if "scores" not in data:
-            raise KeyError(f"'scores' key not found in JSON file: {json_filepath}")
-        scores = data["scores"]
-        results = {}
-        for task in Tasks:
-            task_value = task.value
-            score = scores.get(task_value.metric)
-            results[task_value.metric] = score
-        return self(
-            eval_name=result_key,
-            full_model=full_model,
-            org=org,
-            model=model,
-            results=results,
-            precision=precision,
-            revision=revision,
-            num_few_shots=num_few_shots,
-            add_special_tokens=add_special_tokens,
-            llm_jp_eval_version=version,
-            vllm_version=vllm_version,
-            backend=backend,
-        )
-    def update_with_request_file(self, requests_path):
-        """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision)
-        try:
-            with open(request_file, "r") as f:
-                request = json.load(f)
-            self.model_type = ModelType.from_str(request.get("model_type", ""))
-            self.weight_type = WeightType[request.get("weight_type", "Original")]
-            self.license = request.get("license", "?")
-            self.likes = request.get("likes", 0)
-            self.num_params = request.get("params", 0)
-            self.date = request.get("submitted_time", "")
-            self.architecture = request.get("architecture", "?")
-        except Exception:
-            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision}")
-    def to_dict(self):
-        """Converts the Eval Result to a dict compatible with our dataframe display"""
-        # average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
-        data_dict = {
-            "eval_name": self.eval_name,  # not a column, just a save name,
-            AutoEvalColumn.precision.name: self.precision,
-            AutoEvalColumn.model_type.name: self.model_type.value.name,
-            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-            AutoEvalColumn.architecture.name: self.architecture,
-            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            AutoEvalColumn.dummy.name: self.full_model,
-            AutoEvalColumn.revision.name: self.revision,
-            # AutoEvalColumn.average.name: None,
-            AutoEvalColumn.license.name: self.license,
-            AutoEvalColumn.likes.name: self.likes,
-            AutoEvalColumn.params.name: self.num_params,
-            AutoEvalColumn.num_few_shots.name: self.num_few_shots,
-            AutoEvalColumn.add_special_tokens.name: self.add_special_tokens,
-            AutoEvalColumn.llm_jp_eval_version.name: self.llm_jp_eval_version,
-            AutoEvalColumn.vllm_version.name: self.vllm_version,
-            AutoEvalColumn.backend.name: self.backend,
-        }
-        # for task in Tasks:
-        #     task_value = task.value
-        #     data_dict[task_value.col_name] = self.results.get(task_value.benchmark, None)
-        for task in Tasks:
-            task_value = task.value
-            value = self.results.get(task_value.metric)
-            data_dict[task_value.col_name] = Decimal(value)
-        return data_dict
-def get_request_file_for_model(requests_path, model_name, precision):
-    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
-    request_files = os.path.join(
-        requests_path,
-        f"{model_name}_eval_request_*.json",
-    )
-    request_files = glob.glob(request_files)
-    # Select correct request file (precision)
-    request_file = ""
-    request_files = sorted(request_files, reverse=True)
-    for tmp_request_file in request_files:
-        with open(tmp_request_file, "r") as f:
-            req_content = json.load(f)
-            if req_content["status"] in ["FINISHED"] and req_content["precision"] == precision.split(".")[-1]:
-                request_file = tmp_request_file
-    return request_file
-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
-    """From the path of the results folder root, extract all needed info for results"""
-    model_result_filepaths = []
-    for root, _, files in os.walk(results_path):
-        # We should only have json files in model results
-        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
-            continue
-        # Sort the files by date
-        try:
-            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-        except dateutil.parser._parser.ParserError:
-            files = [files[-1]]
-        for file in files:
-            model_result_filepaths.append(os.path.join(root, file))
-    eval_results = {}
-    for model_result_filepath in model_result_filepaths:
-        # Creation of result
-        eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)
-        # Store results of same eval together
-        eval_name = eval_result.eval_name
-        if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
-        else:
-            eval_results[eval_name] = eval_result
-    results = []
-    for v in eval_results.values():
-        try:
-            v.to_dict()  # we test if the dict version is complete
-            results.append(v)
-        except KeyError:  # not all eval values present
-            continue
-    # print(f"Processing file: {model_result_filepath}")
-    # print(f"Eval result: {eval_result.to_dict()}")
-    return results

src/populate.py CHANGED Viewed

@@ -1,19 +1,40 @@
 import json
 import os
 import pandas as pd
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn
-from src.leaderboard.read_evals import get_raw_eval_results
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
-    raw_data = get_raw_eval_results(results_path, requests_path)
-    all_data_json = [v.to_dict() for v in raw_data]
-    df = pd.DataFrame.from_records(all_data_json)
     # Add a row ID column
     df[AutoEvalColumn.row_id.name] = range(len(df))
@@ -32,7 +53,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     existing_score_cols = [col for col in score_cols if col in df.columns]
     # スコア列を100で割り、.4f形式でフォーマット
-    df[existing_score_cols] = (df[existing_score_cols] / 100).applymap(lambda x: f"{x:.4f}")
     df = df.sort_values(by=[AutoEvalColumn.AVG.name], ascending=False)
     df = df[cols].round(decimals=2)

 import json
 import os
+from decimal import Decimal
+import datasets
 import pandas as pd
+from src.about import Tasks
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn
+def get_leaderboard_df(contents_repo: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
+    df = datasets.load_dataset(contents_repo, split="train").to_pandas()
+    df["Model"] = df["model"].map(make_clickable_model)
+    df["T"] = df["model_type"].map(lambda x: x.split(":")[0].strip())
+    df["Type"] = df["model_type"].map(lambda x: x.split(":")[1].strip())
+    df["Backend Library"] = "vllm"
+    df = df.rename(columns={task.value.metric: task.value.col_name for task in Tasks})
+    df = df.rename(
+        columns={
+            "architecture": "Architecture",
+            "weight_type": "Weight type",
+            "precision": "Precision",
+            "license": "Hub License",
+            "params": "#Params (B)",
+            "likes": "Hub ❤️",
+            "revision": "Revision",
+            "num_few_shot": "Few-shot",
+            "add_special_tokens": "Add Special Tokens",
+            "llm_jp_eval_version": "llm-jp-eval version",
+            "vllm_version": "vllm version",
+            "model": "model_name_for_query",
+        }
+    )
+    df[[task.value.col_name for task in Tasks]] = df[[task.value.col_name for task in Tasks]].map(lambda x: Decimal(x))
     # Add a row ID column
     df[AutoEvalColumn.row_id.name] = range(len(df))
     existing_score_cols = [col for col in score_cols if col in df.columns]
     # スコア列を100で割り、.4f形式でフォーマット
+    df[existing_score_cols] = (df[existing_score_cols] / 100).map(lambda x: f"{x:.4f}")
     df = df.sort_values(by=[AutoEvalColumn.AVG.name], ascending=False)
     df = df[cols].round(decimals=2)