import pandas as pd
import gradio as gr
import os
import re
import requests
from dotenv import load_dotenv
from matplotlib.colors import LinearSegmentedColormap
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
import numpy as np
from huggingface_hub import HfApi
from huggingface_hub.hf_api import HTTPError
from huggingface_hub.utils import GatedRepoError
from gradio_rangeslider import RangeSlider
import datetime

load_dotenv()
webhook_url = os.environ.get("WEBHOOK_URL")

# One spreadsheet file per model-size bucket inside each period folder.
file_name_list = [
    "14b",
    "9b",
    "7b",
    "3b",
    "1b5",
    "other",
]

# One sheet per metric inside each spreadsheet.
sheet_name_list = [
    "cr",
    "bpc",
    "bpb",
]

metric_list = [
    "Compression Rate (%)",
    "Bits Per Character (BPC)",
    "Bits Per Byte (BPB)",
]

model_size_list = [
    "~14B",
    "~9B",
    "~7B",
    "~3B",
    "~1.5B",
    "Other",
]

# UI label -> sheet name.
metric_to_sheet = {
    "Compression Rate (%)": "cr",
    "Bits Per Character (BPC)": "bpc",
    "Bits Per Byte (BPB)": "bpb",
}

# UI label -> spreadsheet base name.
model_size_to_file_name = {
    "~14B": "14b",
    "~9B": "9b",
    "~7B": "7b",
    "~3B": "3b",
    "~1.5B": "1b5",
    "Other": "other",
}


def read_about_md():
    """Return the contents of the local ``about.md`` file as a string."""
    with open('about.md', 'r', encoding='utf-8') as f:
        return f.read()


def rename_columns(df):
    """Strip the trailing ``_suffix`` (e.g. a date tag) from every column name, in place."""
    df.columns = [col.rsplit("_", maxsplit=1)[0] for col in df.columns]
    return df


def get_folders_matching_format(directory):
    """Return the full paths of immediate subfolders of `directory` named like ``YYYY-MM``.

    Returns an empty list when `directory` does not exist.
    """
    pattern = re.compile(r"^\d{4}-\d{2}$")
    folders = []
    if not os.path.exists(directory):
        return folders
    for item in os.listdir(directory):
        full_path = os.path.join(directory, item)
        if os.path.isdir(full_path) and pattern.match(item):
            folders.append(full_path)
    return folders


def get_unique_column_names(data=None):
    """Return the canonical ordered list of data-source column names.

    The names embed zero-width spaces (``\\u200b``) so long headers can wrap in
    the rendered HTML table.  `data` is accepted for call-site compatibility but
    is currently unused.
    """
    return [
        "ao3_\u200benglish",
        "bbc_\u200bnews",
        "wikipedia_\u200benglish",
        "arxiv_\u200bcomputer_\u200bscience",
        "arxiv_\u200bphysics",
        "github_\u200bcpp",
        "github_\u200bpython",
    ]


def color_cell(value):
    """Styler callback: cream background for non-NaN cells ("default" otherwise, as originally)."""
    return "background-color: #fffdd0" if pd.notna(value) else "default"


def update_table(
    period: str,
    models_size: list,
    metric: str,
    visible_columns: list,
    color_columns: list,
    size_range: list,
    midpoint: float = 0.5,
    sort_by: str = "Average (lower=better)",
    ascending: bool = True,
):
    """Build the leaderboard table for the selected filters and return it as styled HTML.

    Reads the module-level ``all_data`` cache.  Returns a plain message string
    when the selection yields no rows.
    """
    print(
        f"Updating - time: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}, period: {period}, models: {models_size}, metric: {metric}, visible_columns: {visible_columns}, color_columns: {color_columns}, size_range: {size_range}, sort_by: {sort_by}, ascending: {ascending}\n"
    )
    if not models_size:
        return "No data available for the selected models and period."

    target_period_data = all_data[period]
    target_file_name = [model_size_to_file_name[model] for model in models_size]
    sheet_name = metric_to_sheet[metric]

    # Drop all-NaN columns per frame before concatenating so absent tests don't pollute the result.
    combined_data = pd.concat(
        [df.dropna(axis=1, how="all") for df in [target_period_data[file_name][sheet_name] for file_name in target_file_name]],
        axis=0,
    )
    if len(combined_data) == 0:
        return "No data available for the selected models and period."

    # Keep only models whose parameter count falls inside the requested size range.
    combined_data = combined_data[combined_data["Parameters Count (B)"].between(size_range[0], size_range[1])]
    combined_data.reset_index(drop=True, inplace=True)
    if len(combined_data) == 0:
        return "No data available for the selected models and period."

    combined_data["Name"] = combined_data["Name"].apply(lambda x: x.replace(".pth", ""))

    ordered_columns = get_unique_column_names()
    relevant_columns = [
        col
        for col in ordered_columns
        if col in visible_columns and col not in ["Name", "Parameters Count (B)", "Average (The lower the better)"]
    ]
    if len(combined_data) > 0:
        combined_data["Average (The lower the better)"] = round(combined_data[relevant_columns].mean(axis=1), 3)
    combined_data = combined_data.rename(columns={"Parameters Count (B)": "Params (B)"})
    combined_data = combined_data.rename(columns={"Average (The lower the better)": "Average (lower=better)"})

    sorted_data = combined_data.sort_values(by=sort_by, ascending=ascending)
    visible_columns = ["Name", "Params (B)", "Average (lower=better)"] + relevant_columns
    filtered_data = sorted_data[visible_columns]
    filtered_data.columns = [col.replace("_", " ") for col in filtered_data.columns]

    formatter = {col: "{:.3f}" for col in filtered_data.columns if filtered_data[col].dtype in ["float64", "float32"]}

    # Per-column gradient anchors: min, max, and a quantile-ish midpoint controlled by `midpoint`.
    colors = ["#63be7b", "#ffffff", "#f8696b"]
    vmin = {}
    vmax = {}
    vmid = {}
    for column in filtered_data.columns:
        if column in ["Name", "Params (B)"]:
            continue
        col_values = filtered_data[column]
        # A single-row column has no range; it is skipped here and again below.
        if len(col_values) > 1:
            sorted_values = np.sort(col_values)
            vmin[column] = sorted_values.min()
            vmax[column] = sorted_values.max()
            # Clamp so midpoint close to 1.0 can never index past the end.
            idx = min(int(len(sorted_values) * midpoint), len(sorted_values) - 1)
            vmid[column] = sorted_values[idx]

    def custom_background_gradient(series, cmap, vmin, vmax, vmid):
        """Map `series` onto `cmap` with a piecewise-linear ramp hinged at `vmid`."""
        if len(series) == 0:
            return series

        def normalize(x):
            # BUG FIX: guard zero-width half-ranges (all values equal on one side
            # of the midpoint) that previously raised ZeroDivisionError.
            if x <= vmid:
                denom = vmid - vmin
                return 0.5 * (x - vmin) / denom if denom else 0.0
            denom = vmax - vmid
            return 0.5 + 0.5 * (x - vmid) / denom if denom else 1.0

        normed = series.apply(normalize)
        rgba_values = [cmap(x) for x in normed]
        return [
            "background-color: rgba({}, {}, {}, {})".format(*[int(255 * ch) for ch in rgba[:3]], rgba[3])
            for rgba in rgba_values
        ]

    target_color_columns = []
    if "Average" in color_columns:
        target_color_columns.append("Average (lower=better)")
    if "Individual Tests" in color_columns:
        target_color_columns.extend([col for col in filtered_data.columns if col not in ["Name", "Params (B)", "Average (lower=better)"]])

    styler = filtered_data.style.format(formatter).map(color_cell, subset=["Params (B)"])

    for column in target_color_columns:
        # BUG FIX: single-row tables never populate vmin/vmax/vmid, which used
        # to raise KeyError here — skip columns without computed anchors.
        if column not in vmin:
            continue
        styler = styler.apply(
            custom_background_gradient,
            cmap=LinearSegmentedColormap.from_list("custom_cmap", colors),
            vmin=vmin[column],
            vmax=vmax[column],
            vmid=vmid[column],
            subset=[column],
        )

    styler = styler.hide(axis="index")

    # Fixed pixel widths: Name, Params, Average, then the individual tests.
    widths = [300, 150, 150, 100, 100, 100, 100, 100, 100, 100, 100]
    table_styles = [
        # Hoisted out of the loop: these selectors are global, appending them
        # once per column only duplicated identical CSS rules.
        {
            "selector": "th",
            "props": [
                ("background-color", "#f5f5f5"),
                ("padding", "8px"),
                ("font-weight", "bold"),
            ],
        },
        {
            "selector": "table",
            "props": [
                ("border-collapse", "collapse"),
                ("border", "1px solid #dddddd"),
            ],
        },
    ]
    for i, w in enumerate(widths):
        sized_props = [
            ("min-width", f"{w}px"),
            ("max-width", f"{w}px"),
            ("text-align", "center"),
            ("border", "1px solid #dddddd"),
        ]
        table_styles.append({"selector": f"th.col{i}", "props": sized_props})
        table_styles.append({"selector": f"td.col{i}", "props": sized_props})

    styler = styler.set_table_styles(table_styles)
    html_output = styler.to_html()
    return html_output


def create_world_languages_gdp_chart():
    """Return a Plotly donut chart of world languages by (approximate) share of global GDP."""
    languages = ["English", "Chinese", "Spanish", "Japanese", "German", "French", "Arabic", "Italian", "Portuguese", "Korean", "Other"]
    shares = [27, 18, 8, 6, 5, 4, 3, 2, 2, 2, 23]
    colors = ["#FF7F7F", "#FFA07A", "#FFDB58", "#90EE90", "#98FB98", "#87CEFA", "#B0C4DE", "#DDA0DD", "#D8BFD8", "#F0E68C", "#E0FFFF"]
    fig = go.Figure(
        data=[
            go.Pie(
                labels=languages,
                values=shares,
                hole=0.3,
                marker=dict(colors=colors, line=dict(color="#FFFFFF", width=2)),
                textinfo="label+percent",
                textposition="outside",
                insidetextorientation="radial",
                textfont=dict(size=12),
            )
        ]
    )
    fig.update_layout(
        title={
            # Rejoined: this literal was split by a stray newline in the mangled source.
            "text": "World Languages by Share of Global GDP",
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
            "font": dict(size=20, color="black"),
        },
        showlegend=False,
        width=700,
        height=500,
        margin=dict(t=80, b=20, l=20, r=20),
    )
    return fig


def check_model_exists(model_id):
    """Classify `model_id`'s availability on the Hugging Face Hub.

    Returns one of: "Exists and is accessible", "Exists but is restricted",
    "Does not exist", or "Error: <details>".
    """
    api = HfApi()
    try:
        api.model_info(model_id)  # raises on missing/gated repos; return value unused
        return "Exists and is accessible"
    except GatedRepoError:
        return "Exists but is restricted"
    except HTTPError as e:
        if e.response.status_code == 404:
            return "Does not exist"
        else:
            return "Error: " + str(e)


def submit_model(name):
    """Validate `name` against the Hub, then POST it to the submission webhook.

    Returns a Markdown status string for the UI.
    """
    if "Exists" not in check_model_exists(name):
        return f"# ERROR: Model {name} does not exist on Hugging Face!"
    try:
        response = requests.post(webhook_url, json={"content": name})
        if response.status_code == 200:
            response_data = response.json()
            if response_data.get("status") == "success":
                return "# SUCCESS: We will check the model as soon as possible. Thank you for your submission!"
            else:
                return f"# ERROR: {response_data.get('message', 'Unknown error')}"
        else:
            return f"# ERROR: Failed to submit model {name}. Server returned status code {response.status_code}."
    except requests.exceptions.RequestException:
        # BUG FIX: previously caught requests.exceptions.HTTPError, which
        # requests.post never raises on its own (it needs raise_for_status()),
        # so connection/timeout failures fell through to the generic handler.
        return "# ERROR: Network error while contacting queue. Please try again in a few minutes."
    except Exception as e:
        print(e)
        return "ERROR: Unexpected error. Please try again later."
# NOTE: a previous, commented-out implementation of create_scaling_plot (a
# log-log scatter with a fitted trend line through a hand-picked set of models)
# was removed here as dead code; recover it from version control if the
# trend-line variant is ever needed again.


def create_scaling_plot(all_data, period):
    """Return a log-log scatter of parameter count vs. compression rate for `period`.

    Pulls the "cr" sheet of every model-size bucket in `all_data[period]`.
    """
    selected_columns = ["Name", "Parameters Count (B)", "Average (The lower the better)"]
    target_data = all_data[period]
    new_df = pd.DataFrame()
    for size in target_data.keys():
        new_df = pd.concat(
            [new_df, target_data[size]["cr"].loc[:, selected_columns].dropna(axis=1, how="all")],
            axis=0,
        )

    x_values = new_df["Parameters Count (B)"].astype(float).tolist()
    y_values = new_df["Average (The lower the better)"].astype(float).tolist()
    names = new_df["Name"].tolist()

    # Axis bounds and tick spacing in log10 space (both axes use type="log").
    x_min, x_max = np.log10(min(x_values)), np.log10(max(x_values))
    y_min, y_max = np.log10(min(y_values)), np.log10(max(y_values))
    x_dtick = (x_max - x_min) / 4
    y_dtick = (y_max - y_min) / 4

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=x_values,
            y=y_values,
            mode="markers",
            name="Models",
            marker=dict(size=12, color="#39C5BB", opacity=0.8),
            text=names,
            customdata=list(zip(x_values, y_values)),
            # NOTE(review): the template literals were mangled in this file
            # (HTML tags stripped); reconstructed with <br> separators and an
            # empty <extra> box to hide the trace label — confirm against the
            # original rendering.
            hovertemplate=(
                "%{text}<br>"
                + "Params: %{customdata[0]:.2f}B<br>"
                + "Compression Rate: %{customdata[1]:.2f}%<br>"
                + "<extra></extra>"
            ),
        )
    )
    fig.update_layout(
        title={"text": "Compression Rate Scaling Law", "x": 0.5, "xanchor": "center", "yanchor": "top"},
        width=800,
        height=600,
        showlegend=True,
        xaxis=dict(
            title="Parameters (B)",
            showgrid=True,
            zeroline=False,
            type="log",
            dtick=x_dtick,
            tickformat=".2f",
            range=[x_min - 0.1, x_max + 0.1],
        ),
        yaxis=dict(
            title="Compression Rate (%)",
            showgrid=True,
            zeroline=False,
            type="log",
            dtick=y_dtick,
            tickformat=".2f",
            range=[y_min - 0.1, y_max + 0.1],
            # Lower compression rate is better, so plot best models at the top.
            autorange="reversed",
        ),
    )
    return fig


def read_all_data(folder_name):
    """Load every ``YYYY-MM`` period folder under `folder_name`.

    Returns ``(all_data, time_list)`` where ``all_data[period][size][sheet]``
    is a DataFrame and `time_list` holds the period folder names found.
    """
    all_data = {}
    time_list = []
    for folder in get_folders_matching_format(folder_name):
        # BUG FIX: the original membership checks tested the wrong keys
        # (`all_data.get(folder)` vs. key `folder_name`, and `all_data.get(file_name)`
        # on the outer dict); setdefault makes the intent explicit and correct.
        # Also renamed the loop-local so it no longer shadows the parameter.
        period = os.path.basename(folder)
        time_list.append(period)
        period_data = all_data.setdefault(period, {})
        for file_name in file_name_list:
            sheets = period_data.setdefault(file_name, {})
            for sheet_name in sheet_name_list:
                final_file_name = os.path.join(folder, file_name)
                sheets[sheet_name] = rename_columns(pd.read_excel(final_file_name + ".xlsx", sheet_name=sheet_name))
    return all_data, time_list


# NOTE: a commented-out read_mutilange_data() loader (multilingual spreadsheets
# for the "MultiLang" tab, still "Coming soon") was removed as dead code; see
# version control history to restore it when that tab is implemented.

# ---- module-level initialization: load data once at startup ----
all_data, time_list = read_all_data("data")
time_list.sort()
last_period = time_list[-1]

initial_fig = create_scaling_plot(all_data, last_period)
initial_metric = metric_list[0]
initial_columns = get_unique_column_names(all_data)
initial_colors = ["Average", "Individual Tests"]
initial_size_range = [0, 40]
initial_data = update_table(last_period, model_size_list, initial_metric, initial_columns, initial_colors, initial_size_range)

css = """
.gradio-container {
    max-width: 95% !important;
    margin: 0 auto;
}
.tab-buttons button {
    font-size: 1.3em;
}
.gr-dataframe th {
    white-space: normal;
    word-break: break-word;
}
table {
    margin-left: auto !important;
    margin-right: auto !important;
    width: 100% !important;
}
"""

# NOTE(review): both HTML constants were garbled in this file (tags stripped by
# whatever flattened it); reconstructed with the surviving text — confirm the
# exact markup against the deployed page.
TITLE_HTML = '<h1 style="text-align: center; font-size: 2em;">🏆 LLM Compression Leaderboard</h1>'
SUBTITLE_HTML = (
    '<p style="text-align: center;">'
    "Welcome to Uncheatable Eval LLM Compression Leaderboard, where fancy fine-tuning and cheating won't work 🚫; "
    "only compute 💻, data 📊, and real innovation 🔥 can prevail!"
    "</p>"
)

with gr.Blocks(css=css) as demo:
    gr.HTML(TITLE_HTML)
    gr.HTML(SUBTITLE_HTML)
    with gr.Tabs() as tabs:
        with gr.Tab("🏆 Leaderboard"):
            with gr.Row():
                with gr.Column():
                    period_selector = gr.Dropdown(label="Period", choices=time_list, value=last_period)
                    model_selector = gr.CheckboxGroup(label="Model Size", choices=model_size_list, value=model_size_list)
                    size_range_slider = RangeSlider(minimum=0, maximum=40, value=[0, 40], step=0.1, label="Model Size Range")
                    metric_selector = gr.Dropdown(label="Metric", choices=metric_list, value=initial_metric)
                with gr.Column():
                    midpoint_slider = gr.Slider(minimum=0.1, maximum=0.9, value=0.5, step=0.01, label="Color Gradient Midpoint")
                    color_selector = gr.CheckboxGroup(label="Colored Columns", choices=["Average", "Individual Tests"], value=initial_colors)
                    colfilter = gr.CheckboxGroup(label="Data Source", choices=get_unique_column_names(all_data), value=initial_columns)

            table = gr.HTML(initial_data)

            # Every control re-renders the same table from the same inputs, so
            # wire them in a loop instead of seven copy-pasted .change() calls.
            table_inputs = [
                period_selector,
                model_selector,
                metric_selector,
                colfilter,
                color_selector,
                size_range_slider,
                midpoint_slider,
            ]
            for control in table_inputs:
                control.change(update_table, inputs=table_inputs, outputs=table)

        with gr.Tab("🌍 MultiLang"):
            gr.Markdown("## Coming soon...")
            world_languages_plot = gr.Plot(create_world_languages_gdp_chart())

        with gr.Tab("📈 Scaling Law"):
            period_selector_2 = gr.Dropdown(label="Period", choices=time_list, value=last_period)

            def update_plot(period):
                """Rebuild the scaling-law figure for the newly selected period."""
                return create_scaling_plot(all_data, period)

            plot = gr.Plot(initial_fig)
            period_selector_2.change(update_plot, inputs=period_selector_2, outputs=plot)

        with gr.Tab("ℹ️ About"):
            gr.Markdown(read_about_md())

        with gr.Tab("🚀 Submit"):
            with gr.Group():
                with gr.Row():
                    model_name = gr.Textbox(max_lines=1, placeholder="Enter model name...", show_label=False, scale=4)
                    submit = gr.Button("Submit", variant="primary", scale=0)
            output = gr.Markdown("# Enter a public HF repo id, then hit Submit to add it to the evaluation queue.")
            submit.click(fn=submit_model, inputs=model_name, outputs=output)

demo.launch(share=False)