llm-perf-leaderboard

Running

App Files Files Community

IlyasMoutawwakil HF Staff committed on Nov 16, 2023

Commit

134a499

1 Parent(s): 04a3faa

updated the llm-perf

Browse files

Files changed (4) hide show

.gitignore +2 -0
app.py +131 -124
requirements.txt +4 -5
src/utils.py +5 -24

.gitignore CHANGED Viewed

@@ -3,3 +3,5 @@ __pycache__/
 .ipynb_checkpoints
 *ipynb
 .vscode/

 .ipynb_checkpoints
 *ipynb
 .vscode/
+dataset/

app.py CHANGED Viewed

@@ -1,88 +1,98 @@
 import os
 import gradio as gr
 import pandas as pd
 import plotly.express as px
-from apscheduler.schedulers.background import BackgroundScheduler
 from src.assets.css_html_js import custom_css
 from src.assets.text_content import (
     TITLE,
-    INTRODUCTION_TEXT,
     ABOUT_TEXT,
     EXAMPLE_CONFIG_TEXT,
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
 )
-from src.utils import (
-    restart_space,
-    load_dataset_repo,
-    process_model_name,
-    process_model_type,
-)
-HARDWARE_NAMES = ["A100-80GB", "RTX4090-24GB"]
-HARDWARES_EMOJIS = ["🖥️", "💻"]
-LLM_PERF_LEADERBOARD_REPO = "optimum/llm-perf-leaderboard"
 LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
-OPTIMUM_TOKEN = os.environ.get("OPTIMUM_TOKEN", None)
 ALL_COLUMNS_MAPPING = {
     "backend.name": "Backend 🏭",
     "backend.torch_dtype": "Dtype 📥",
     "optimizations": "Optimizations 🛠️",
     "quantization": "Quantization 🗜️",
-    #
-    "weight_class": "Class 🏋️",
-    "model_type": "Type 🤗",
-    #
-    "generate.peak_memory(MB)": "Memory (MB) ⬇️",
-    "generate.throughput(tokens/s)": "Throughput (tokens/s) ⬆️",
     "generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh) ⬇️",
-    "best_score": "Best Score (%) ⬆️",
-    #
-    "best_scored_model": "Best Scored LLM 🏆",
 }
 ALL_COLUMNS_DATATYPES = [
     "str",
     "str",
     "str",
     "str",
-    #
-    "str",
-    "str",
-    #
     "number",
     "number",
     "number",
-    "str",
-    #
-    "markdown",
-]
-NO_DUPLICATES_COLUMNS = [
-    "backend.name",
-    "backend.torch_dtype",
-    "optimizations",
-    "quantization",
-    #
-    "weight_class",
-    "model_type",
 ]
-SORTING_COLUMN = ["best_score", "generate.latency(s)", "generate.peak_memory(MB)"]
-SORTING_ASCENDING = [False, True, True]
-llm_perf_dataset_repo = load_dataset_repo(LLM_PERF_DATASET_REPO, OPTIMUM_TOKEN)
-def get_benchmark_df(benchmark="Succeeded-1xA100-80GB"):
-    if llm_perf_dataset_repo:
-        llm_perf_dataset_repo.git_pull()
-    # load data
-    benchmark_df = pd.read_csv(f"./llm-perf-dataset/reports/{benchmark}.csv")
-    clusters_df = pd.read_csv("./llm-perf-dataset/Clustered-Open-LLM-Leaderboard.csv")
     # merge on model
-    merged_df = benchmark_df.merge(
-        clusters_df, left_on="model", right_on="best_scored_model"
-    )
     # transpose energy consumption
     merged_df["generate.energy_consumption(tokens/kWh)"] = (
         1 / merged_df["generate.energy_consumption(kWh/token)"].fillna(1)
@@ -91,38 +101,44 @@ def get_benchmark_df(benchmark="Succeeded-1xA100-80GB"):
     merged_df.loc[
         merged_df["generate.energy_consumption(tokens/kWh)"] == 1,
         "generate.energy_consumption(tokens/kWh)",
-    ] = "N/A"
-    # add optimizations
-    merged_df["optimizations"] = merged_df["backend.bettertransformer"].apply(
-        lambda x: "BetterTransformer" if x else "None"
     )
     # add quantization scheme
-    merged_df["quantization"] = merged_df["backend.quantization_strategy"].apply(
         lambda x: "BnB.4bit" if x == "bnb" else ("GPTQ.4bit" if x == "gptq" else "None")
     )
-    # sort
     merged_df.sort_values(by=SORTING_COLUMN, ascending=SORTING_ASCENDING, inplace=True)
-    # drop duplicates
-    merged_df.drop_duplicates(subset=NO_DUPLICATES_COLUMNS, inplace=True)
     return merged_df
 def get_benchmark_table(bench_df):
     copy_df = bench_df.copy()
-    # filter
-    copy_df = copy_df[list(ALL_COLUMNS_MAPPING.keys())]
-    # rename
-    copy_df.rename(columns=ALL_COLUMNS_MAPPING, inplace=True)
     # transform
-    copy_df["Type 🤗"] = copy_df["Type 🤗"].apply(process_model_type)
-    copy_df["Best Scored LLM 🏆"] = copy_df["Best Scored LLM 🏆"].apply(
-        process_model_name
-    )
     # process quantization
-    copy_df["Best Score (%) ⬆️"] = copy_df.apply(
-        lambda x: f"{x['Best Score (%) ⬆️']}**"
         if x["Quantization 🗜️"] in ["BnB.4bit", "GPTQ.4bit"]
-        else x["Best Score (%) ⬆️"],
         axis=1,
     )
     return copy_df
@@ -130,17 +146,18 @@ def get_benchmark_table(bench_df):
 def get_benchmark_chart(bench_df):
     copy_df = bench_df.copy()
     # filter latency bigger than 100s
-    copy_df = copy_df[copy_df["generate.latency(s)"] <= 100]
-    # rename model_type
-    copy_df["model_type"] = copy_df["model_type"].apply(process_model_type)
     fig = px.scatter(
         copy_df,
-        y="best_score",
-        x="generate.latency(s)",
-        size="generate.peak_memory(MB)",
-        color="model_type",
-        custom_data=list(ALL_COLUMNS_MAPPING.keys()),
         color_discrete_sequence=px.colors.qualitative.Light24,
     )
     fig.update_layout(
@@ -151,17 +168,17 @@ def get_benchmark_chart(bench_df):
             "xanchor": "center",
             "yanchor": "top",
         },
-        xaxis_title="Per 1000 tokens Latency (s)",
-        yaxis_title="Open LLM Score (%)",
-        legend_title="LLM Type",
         width=1200,
         height=600,
     )
     fig.update_traces(
         hovertemplate="<br>".join(
             [
-                f"<b>{ALL_COLUMNS_MAPPING[key]}:</b> %{{customdata[{i}]}}"
-                for i, key in enumerate(ALL_COLUMNS_MAPPING.keys())
             ]
         )
     )
@@ -176,17 +193,17 @@ def filter_query(
     quantization_scheme,
     score,
     memory,
-    hardware,
 ):
-    raw_df = get_benchmark_df(benchmark=f"Succeeded-1x{hardware}")
     filtered_df = raw_df[
-        raw_df["best_scored_model"].str.lower().str.contains(text.lower())
-        & raw_df["backend.name"].isin(backends)
-        & raw_df["backend.torch_dtype"].isin(datatypes)
         & (
             pd.concat(
                 [
-                    raw_df["optimizations"].str.contains(optimization)
                     for optimization in optimizations
                 ],
                 axis=1,
@@ -197,7 +214,7 @@ def filter_query(
         & (
             pd.concat(
                 [
-                    raw_df["quantization"] == quantization
                     for quantization in quantization_scheme
                 ],
                 axis=1,
@@ -205,8 +222,8 @@ def filter_query(
             if len(quantization_scheme) > 0
             else True
         )
-        & (raw_df["best_score"] >= score)
-        & (raw_df["forward.peak_memory(MB)"] <= memory)
     ]
     filtered_table = get_benchmark_table(filtered_df)
     filtered_chart = get_benchmark_chart(filtered_df)
@@ -222,29 +239,29 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="descriptive-text")
     with gr.Tabs(elem_classes="leaderboard-tabs"):
-        hardware_placeholders = {}
-        hardware_tables = {}
-        hardware_plots = {}
         ####################### HARDWARE TABS #######################
-        for i, (hardware, emoji) in enumerate(zip(HARDWARE_NAMES, HARDWARES_EMOJIS)):
-            # dummy placeholder of the hardware name
-            hardware_placeholders[hardware] = gr.Textbox(value=hardware, visible=False)
-            with gr.TabItem(f"{hardware} {emoji}", id=i):
-                with gr.Tabs(elem_classes="hardware-tabs"):
                     # placeholder for full dataframe
-                    hardware_df = get_benchmark_df(benchmark=f"Succeeded-1x{hardware}")
                     with gr.TabItem("Leaderboard 🏅", id=0):
                         gr.HTML(
                             "👉 Scroll to the right 👉 for additional columns.",
                             elem_id="descriptive-text",
                         )
                         # Original leaderboard table
-                        hardware_tables[hardware] = gr.components.Dataframe(
-                            value=get_benchmark_table(hardware_df),
                             headers=list(ALL_COLUMNS_MAPPING.values()),
                             datatype=ALL_COLUMNS_DATATYPES,
-                            elem_id="hardware-table",
-                            # show_label=False,
                         )
                     with gr.TabItem("Plot 📊", id=1):
                         gr.HTML(
@@ -252,13 +269,13 @@ with demo:
                             elem_id="descriptive-text",
                         )
                         # Original leaderboard plot
-                        hardware_plots[hardware] = gr.components.Plot(
-                            value=get_benchmark_chart(hardware_df),
-                            elem_id="hardware-plot",
                             show_label=False,
                         )
-        ####################### CONTROL PANEL #######################
         with gr.TabItem("Control Panel 🎛️", id=2):
             gr.HTML(
                 "Use this control panel to filter the leaderboard's table and plot.",  # noqa: E501
@@ -328,7 +345,7 @@ with demo:
                     value="Filter 🚀",
                     elem_id="filter-button",
                 )
-            for hardware in HARDWARE_NAMES:
                 filter_button.click(
                     filter_query,
                     [
@@ -339,9 +356,9 @@ with demo:
                         quantization_checkboxes,
                         score_slider,
                         memory_slider,
-                        hardware_placeholders[hardware],
                     ],
-                    [hardware_tables[hardware], hardware_plots[hardware]],
                 )
         ####################### ABOUT TAB #######################
@@ -356,18 +373,8 @@ with demo:
                 value=CITATION_BUTTON_TEXT,
                 label=CITATION_BUTTON_LABEL,
                 elem_id="citation-button",
-            ).style(show_copy_button=True)
-# Restart space every hour
-scheduler = BackgroundScheduler()
-scheduler.add_job(
-    restart_space,
-    "interval",
-    seconds=3600,
-    args=[LLM_PERF_LEADERBOARD_REPO, OPTIMUM_TOKEN],
-)
-scheduler.start()
 # Launch demo
-demo.queue(concurrency_count=10).launch()

 import os
 import gradio as gr
 import pandas as pd
 import plotly.express as px
+from huggingface_hub.file_download import hf_hub_download
+from src.utils import process_model_name, process_model_arch
 from src.assets.css_html_js import custom_css
 from src.assets.text_content import (
     TITLE,
     ABOUT_TEXT,
+    INTRODUCTION_TEXT,
     EXAMPLE_CONFIG_TEXT,
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
 )
+HF_TOKEN = os.environ.get("HF_TOKEN", None)
 LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
+MACHINE_TO_HARDWARE = {"hf-dgx-01": "A100-80GB 🖥️"}
 ALL_COLUMNS_MAPPING = {
+    # model
+    "Model": "Model 🤗",
+    "Arch": "Arch 🏛️",
+    "Size": "Size 🏋️",
+    # deployment settings
     "backend.name": "Backend 🏭",
     "backend.torch_dtype": "Dtype 📥",
     "optimizations": "Optimizations 🛠️",
     "quantization": "Quantization 🗜️",
+    # throughput measurements
+    "decode.throughput(tokens/s)": "Decode Throughput (tokens/s) ⬆️",
+    "generate.throughput(tokens/s)": "E2E Throughput (tokens/s) ⬆️",
+    # latency measurements
+    "forward.latency(s)": "Prefill Latency (s) ⬇️",
+    "generate.latency(s)": "E2E Latency (s) ⬇️",
+    # memory measurements
+    "generate.max_memory_allocated(MB)": "Allocated Memory (MB) ⬇️",
+    "generate.max_memory_reserved(MB)": "Reserved Memory (MB) ⬇️",
+    "generate.max_memory_used(MB)": "Used Memory (MB) ⬇️",
+    # energy measurements
     "generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh) ⬇️",
+    # quality measurements
+    "Score": "Avg Score (%) ⬆️",
 }
+SORTING_COLUMN = ["Score", "generate.throughput(tokens/s)"]
+SORTING_ASCENDING = [False, True]
 ALL_COLUMNS_DATATYPES = [
+    # open llm
+    "markdown",
+    "markdown",
+    "number",
+    # deployment settings
     "str",
     "str",
     "str",
     "str",
+    # measurements
+    "number",
+    "number",
+    "number",
+    "number",
+    "number",
+    "number",
+    "number",
     "number",
     "number",
     "number",
 ]
+def get_benchmark_df(machine="hf-dgx-01"):
+    # download data
+    hf_hub_download(
+        repo_id="optimum/llm-perf-dataset",
+        filename="open-llm.csv",
+        local_dir="dataset",
+        repo_type="dataset",
+        token=HF_TOKEN,
+    )
+    hf_hub_download(
+        repo_id="optimum/llm-perf-dataset",
+        filename=f"{machine}/full-report.csv",
+        local_dir="dataset",
+        repo_type="dataset",
+        token=HF_TOKEN,
+    )
+    open_llm = pd.read_csv("dataset/open-llm.csv")
+    full_report = pd.read_csv(f"dataset/{machine}/full-report.csv")
     # merge on model
+    merged_df = open_llm.merge(full_report, left_on="Model", right_on="model")
     # transpose energy consumption
     merged_df["generate.energy_consumption(tokens/kWh)"] = (
         1 / merged_df["generate.energy_consumption(kWh/token)"].fillna(1)
     merged_df.loc[
         merged_df["generate.energy_consumption(tokens/kWh)"] == 1,
         "generate.energy_consumption(tokens/kWh)",
+    ] = pd.NA
+    # add optimizations column
+    merged_df["optimizations"] = merged_df[
+        ["backend.to_bettertransformer", "backend.use_flash_attention_2"]
+    ].apply(
+        lambda x: "BetterTransformer"
+        if x["backend.to_bettertransformer"]
+        else ("FlashAttentionV2" if x["backend.use_flash_attention_2"] else "None"),
+        axis=1,
     )
     # add quantization scheme
+    merged_df["quantization"] = merged_df["backend.quantization_scheme"].apply(
         lambda x: "BnB.4bit" if x == "bnb" else ("GPTQ.4bit" if x == "gptq" else "None")
     )
+    # add decode throughput
+    merged_df["decode.throughput(tokens/s)"] = (
+        1000 / (merged_df["generate.latency(s)"] - merged_df["forward.latency(s)"])
+    ).round(2)
+    # sort by metric
     merged_df.sort_values(by=SORTING_COLUMN, ascending=SORTING_ASCENDING, inplace=True)
+    # filter columns
+    merged_df = merged_df[list(ALL_COLUMNS_MAPPING.keys())]
+    # rename columns
+    merged_df.rename(columns=ALL_COLUMNS_MAPPING, inplace=True)
     return merged_df
 def get_benchmark_table(bench_df):
     copy_df = bench_df.copy()
     # transform
+    copy_df["Model 🤗"] = copy_df["Model 🤗"].apply(process_model_name)
+    copy_df["Arch 🏛️"] = copy_df["Arch 🏛️"].apply(process_model_arch)
     # process quantization
+    copy_df["Avg Score (%) ⬆️"] = copy_df.apply(
+        lambda x: f"{x['Avg Score (%) ⬆️']}**"
         if x["Quantization 🗜️"] in ["BnB.4bit", "GPTQ.4bit"]
+        else x["Avg Score (%) ⬆️"],
         axis=1,
     )
     return copy_df
 def get_benchmark_chart(bench_df):
     copy_df = bench_df.copy()
+    # transform
+    copy_df["Arch 🏛️"] = copy_df["Arch 🏛️"].apply(process_model_arch)
     # filter latency bigger than 100s
+    # copy_df = copy_df[copy_df["E2E Latency (s) ⬇️"] <= 100]
     fig = px.scatter(
         copy_df,
+        y="Avg Score (%) ⬆️",
+        x="E2E Latency (s) ⬇️",
+        size="Allocated Memory (MB) ⬇️",
+        color="Arch 🏛️",
+        custom_data=list(ALL_COLUMNS_MAPPING.values()),
         color_discrete_sequence=px.colors.qualitative.Light24,
     )
     fig.update_layout(
             "xanchor": "center",
             "yanchor": "top",
         },
+        xaxis_title="Per 1000 Tokens Latency (s)",
+        yaxis_title="Avg Open LLM Score (%)",
+        legend_title="LLM Architecture",
         width=1200,
         height=600,
     )
     fig.update_traces(
         hovertemplate="<br>".join(
             [
+                f"<b>{column}:</b> %{{customdata[{i}]}}"
+                for i, column in enumerate(ALL_COLUMNS_MAPPING.values())
             ]
         )
     )
     quantization_scheme,
     score,
     memory,
+    machine,
 ):
+    raw_df = get_benchmark_df(machine=machine)
     filtered_df = raw_df[
+        raw_df["Model 🤗"].str.contains(text, case=False)
+        & raw_df["Backend 🏭"].isin(backends)
+        & raw_df["Dtype 📥"].isin(datatypes)
         & (
             pd.concat(
                 [
+                    raw_df["Optimizations 🛠️"].str.contains(optimization, case=False)
                     for optimization in optimizations
                 ],
                 axis=1,
         & (
             pd.concat(
                 [
+                    raw_df["Quantization 🗜️"].str.contains(quantization, case=False)
                     for quantization in quantization_scheme
                 ],
                 axis=1,
             if len(quantization_scheme) > 0
             else True
         )
+        & (raw_df["Avg Score (%) ⬆️"] >= score)
+        & (raw_df["Allocated Memory (MB) ⬇️"] <= memory)
     ]
     filtered_table = get_benchmark_table(filtered_df)
     filtered_chart = get_benchmark_chart(filtered_df)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="descriptive-text")
     with gr.Tabs(elem_classes="leaderboard-tabs"):
+        machine_placeholders = {}
+        machine_tables = {}
+        machine_plots = {}
         ####################### HARDWARE TABS #######################
+        for i, (machine, hardware) in enumerate(MACHINE_TO_HARDWARE.items()):
+            # dummy placeholder of the machine name
+            machine_placeholders[machine] = gr.Textbox(value=machine, visible=False)
+            with gr.TabItem(hardware, id=i):
+                with gr.Tabs(elem_classes="machine-tabs"):
                     # placeholder for full dataframe
+                    machine_df = get_benchmark_df(machine=machine)
                     with gr.TabItem("Leaderboard 🏅", id=0):
                         gr.HTML(
                             "👉 Scroll to the right 👉 for additional columns.",
                             elem_id="descriptive-text",
                         )
                         # Original leaderboard table
+                        machine_tables[machine] = gr.components.Dataframe(
+                            value=get_benchmark_table(machine_df),
                             headers=list(ALL_COLUMNS_MAPPING.values()),
                             datatype=ALL_COLUMNS_DATATYPES,
+                            elem_id="machine-table",
                         )
                     with gr.TabItem("Plot 📊", id=1):
                         gr.HTML(
                             elem_id="descriptive-text",
                         )
                         # Original leaderboard plot
+                        machine_plots[machine] = gr.components.Plot(
+                            value=get_benchmark_chart(machine_df),
+                            elem_id="machine-plot",
                             show_label=False,
                         )
+        ###################### CONTROL PANEL #######################
         with gr.TabItem("Control Panel 🎛️", id=2):
             gr.HTML(
                 "Use this control panel to filter the leaderboard's table and plot.",  # noqa: E501
                     value="Filter 🚀",
                     elem_id="filter-button",
                 )
+            for machine in MACHINE_TO_HARDWARE:
                 filter_button.click(
                     filter_query,
                     [
                         quantization_checkboxes,
                         score_slider,
                         memory_slider,
+                        machine_placeholders[machine],
                     ],
+                    [machine_tables[machine], machine_plots[machine]],
                 )
         ####################### ABOUT TAB #######################
                 value=CITATION_BUTTON_TEXT,
                 label=CITATION_BUTTON_LABEL,
                 elem_id="citation-button",
+                show_copy_button=True,
+            )
 # Launch demo
+demo.launch(show_api=False)

requirements.txt CHANGED Viewed

@@ -1,5 +1,4 @@
-gradio==3.42.0
-APScheduler==3.10.4
-huggingface_hub==0.16.4
-plotly==5.16.1
-pandas==2.1.0

+huggingface_hub
+gradio
+plotly
+pandas

src/utils.py CHANGED Viewed

@@ -17,26 +17,7 @@ def change_tab(query_param):
         return gr.Tabs.update(selected=0)
-def restart_space(LLM_PERF_LEADERBOARD_REPO, OPTIMUM_TOKEN):
-    HfApi().restart_space(repo_id=LLM_PERF_LEADERBOARD_REPO, token=OPTIMUM_TOKEN)
-def load_dataset_repo(LLM_PERF_DATASET_REPO, OPTIMUM_TOKEN):
-    llm_perf_dataset_repo = None
-    if OPTIMUM_TOKEN:
-        print("Loading LLM-Perf-Dataset from Hub...")
-        llm_perf_dataset_repo = Repository(
-            local_dir="./llm-perf-dataset",
-            clone_from=LLM_PERF_DATASET_REPO,
-            token=OPTIMUM_TOKEN,
-            repo_type="dataset",
-        )
-        llm_perf_dataset_repo.git_pull()
-    return llm_perf_dataset_repo
-LLM_MODEL_TYPES = {
     # branded ?
     "gpt_bigcode": "GPT-BigCode 🌸",
     "RefinedWebModel": "Falcon 🦅",
@@ -69,8 +50,8 @@ def process_model_name(model_name):
     return model_hyperlink(link, model_name)
-def process_model_type(model_type):
-    if model_type in LLM_MODEL_TYPES:
-        return LLM_MODEL_TYPES[model_type]
     else:
-        return model_type

         return gr.Tabs.update(selected=0)
+LLM_MODEL_ARCHS = {
     # branded ?
     "gpt_bigcode": "GPT-BigCode 🌸",
     "RefinedWebModel": "Falcon 🦅",
     return model_hyperlink(link, model_name)
+def process_model_arch(model_arch):
+    if model_arch in LLM_MODEL_ARCHS:
+        return LLM_MODEL_ARCHS[model_arch]
     else:
+        return model_arch