# Spaces: Running  (hosting-page artifact from the original scrape, kept as a comment so the file parses)
# source: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/blob/main/src/utils_display.py | |
import json | |
import hashlib | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
from dataclasses import dataclass | |
import plotly.graph_objects as go | |
from transformers import AutoConfig | |
from src.config import afrobench_path, afrobench_lite_path, lite_languages_path | |
# These classes are for user facing column names, to avoid having to change them | |
# all around the code when a modif is needed | |
@dataclass
class ColumnContent:
    """Descriptor for one user-facing leaderboard column.

    Attributes:
        name: Column label shown in the UI.
        type: Gradio/display type of the column (e.g. "str", "number").
        displayed_by_default: Whether the column starts visible.
        hidden: Whether the column is excluded from the picker entirely.
    """

    # NOTE(fix): the @dataclass decorator was missing, so the class could not
    # be instantiated with field arguments even though `dataclass` is imported
    # at the top of the file and the upstream leaderboard source decorates it.
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
def fields(raw_class):
    """Return the non-dunder class-attribute values of *raw_class*.

    Used to collect the ColumnContent descriptors defined on a
    column-definition class without picking up ``__module__`` etc.
    """
    collected = []
    for attr_name, attr_value in raw_class.__dict__.items():
        # Skip anything dunder-ish on either end of the name.
        if attr_name.startswith("__") or attr_name.endswith("__"):
            continue
        collected.append(attr_value)
    return collected
def model_hyperlink(link, model_name):
    """Render *model_name* as a dotted-underline anchor opening *link* in a new tab."""
    anchor_css = "color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"
    return f'<a target="_blank" href="{link}" style="{anchor_css}">{model_name}</a>'
def styled_error(error):
    """Wrap *error* in a centered red paragraph for HTML display."""
    template = "<p style='color: red; font-size: 20px; text-align: center;'>{}</p>"
    return template.format(error)
def styled_warning(warn):
    """Wrap *warn* in a centered orange paragraph for HTML display."""
    template = "<p style='color: orange; font-size: 20px; text-align: center;'>{}</p>"
    return template.format(warn)
def styled_message(message):
    """Wrap *message* in a centered green paragraph for HTML display."""
    template = "<p style='color: green; font-size: 20px; text-align: center;'>{}</p>"
    return template.format(message)
def has_no_nan_values(df, columns):
    """Boolean Series: True for each row whose listed *columns* are all non-null."""
    # A row passes only if no selected cell is NaN.
    return ~df[columns].isna().any(axis=1)
def has_nan_values(df, columns):
    """Boolean Series: True for each row with at least one null among *columns*."""
    # Complement of "all selected cells are present".
    return ~df[columns].notna().all(axis=1)
def is_model_on_hub(model_name: str, revision: str) -> tuple[bool, str | None]:
    """Check that ``model_name`` at ``revision`` resolves on the Hugging Face Hub.

    NOTE(fix): the original annotation said ``-> bool`` but every path returns
    a 2-tuple; the annotation now matches the actual contract.

    Args:
        model_name: Hub repo id of the model.
        revision: Git revision (branch, tag, or commit) to resolve.

    Returns:
        ``(True, None)`` when the config loads, otherwise ``(False, reason)``
        where *reason* is a user-facing error fragment.
    """
    try:
        AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=False)
        return True, None
    except ValueError:
        # Raised when the repo requires custom code; we refuse those for safety.
        return (
            False,
            "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
        )
    except Exception as e:
        # Deliberate best-effort catch-all: any other failure (missing repo,
        # network error, gated model) is reported as "not found".
        print(f"Could not get the model config from the hub.: {e}")
        return False, "was not found on hub!"
def get_color(name):
    """Map *name* to a stable ``rgb(r, g, b)`` string from the tab20 palette.

    NOTE(fix): the original used the builtin ``hash``, which is salted per
    interpreter run (PYTHONHASHSEED), so colors silently changed between runs
    despite the "consistent color" intent. An MD5 digest (via the already
    imported ``hashlib``) is deterministic across processes.
    """
    digest = int(hashlib.md5(name.encode("utf-8")).hexdigest(), 16)
    color = plt.cm.tab20(digest % 20)  # 20 unique colors
    return f"rgb({int(color[0]*255)}, {int(color[1]*255)}, {int(color[2]*255)})"
# def plot_model_scores(df): | |
# # Assume df already has: Model, Score, and columns you filtered on | |
# color_map = { | |
# "LLaMa": "cornflowerblue", | |
# "Aya": "lightcoral", | |
# "Gemma": "mediumpurple", | |
# "GPT": "seagreen", | |
# "Gemini": "goldenrod", | |
# "AfroLLaMa": "indianred", | |
# } | |
# | |
# def assign_color(model_name): | |
# for key, color in color_map.items(): | |
# if key.lower() in model_name.lower(): | |
# return color | |
# return "gray" | |
# | |
# df_sorted = df.copy() | |
# df_sorted["Color"] = df_sorted["Model"].apply(assign_color) | |
# df_sorted = df_sorted.sort_values("Score", ascending=False) | |
# | |
# fig = go.Figure() | |
# fig.add_trace( | |
# go.Bar( | |
# x=df_sorted["Score"], | |
# y=df_sorted["Model"], | |
# orientation='h', | |
# marker_color=df_sorted["Color"], | |
# hoverinfo="x+y", | |
# ) | |
# ) | |
# | |
# fig.update_layout( | |
# title="π Model Score Comparison", | |
# xaxis_title="Average Score", | |
# yaxis_title="Model", | |
# height=600, | |
# margin=dict(l=100, r=20, t=40, b=40), | |
# ) | |
# return fig | |
# def plot_model_scores(df): | |
# df_sorted = df.copy() | |
# df_sorted["Color"] = df_sorted["Model"].apply(get_color) | |
# | |
# fig = go.Figure() | |
# fig.add_trace( | |
# go.Bar( | |
# x=df_sorted["Score"], | |
# y=df_sorted["Model"], | |
# orientation='h', | |
# marker_color=df_sorted["Color"], | |
# hoverinfo="x+y", | |
# ) | |
# ) | |
# | |
# fig.update_layout( | |
# title="π Model Score Comparison", | |
# xaxis_title="Average Score", | |
# yaxis_title="Model", | |
# height=600, | |
# margin=dict(l=100, r=20, t=40, b=40), | |
# ) | |
# return fig | |
def plot_model_scores(df):
    """Scatter plot of model score versus parameter-size bucket.

    Each model becomes its own single-point trace so every model gets a
    distinct legend entry. The x-axis is categorical, ordered by numeric
    size with unknown sizes ("UNK") placed last.
    """
    data = df.copy()
    data["Color"] = data["Model"].apply(get_color)

    def _size_label(model_name):
        # First whitespace token shaped like "8B"/"13B" wins; else "UNK".
        for token in model_name.split():
            if token.endswith("B") and token[:-1].isdigit():
                return token
        return "UNK"

    def _size_rank(label):
        # Sentinel 100 pushes "UNK" after every known size bucket.
        return 100 if label == "UNK" else int(label[:-1])

    data["Size"] = data["Model"].apply(_size_label)
    data["Size Num"] = data["Size"].apply(_size_rank)
    ordered_sizes = (
        data.drop_duplicates("Size").sort_values("Size Num")["Size"].tolist()
    )

    fig = go.Figure()
    for _, entry in data.iterrows():
        marker_style = dict(
            size=14,
            color=entry["Color"],
            line=dict(width=1, color="black"),
        )
        fig.add_trace(
            go.Scatter(
                x=[entry["Size"]],
                y=[entry["Score"]],
                mode="markers",
                name=entry["Model"],
                marker=marker_style,
                hovertemplate=f"<b>{entry['Model']}</b><br>Score: {entry['Score']}<br>Size: {entry['Size']}",
                showlegend=True,
            )
        )

    fig.update_layout(
        title="π Model Score vs Size",
        xaxis=dict(
            title="Model Size",
            type="category",
            categoryorder="array",
            categoryarray=ordered_sizes,
        ),
        yaxis_title="Average Score",
        height=600,
        margin=dict(l=60, r=60, t=40, b=40),
        legend=dict(title="Model", orientation="v", x=1.05, y=1),
    )
    return fig
def plot_leaderboard_scores(view_type, selected_cols, source):
    """Build the score-vs-size figure for one leaderboard view.

    Args:
        view_type: Aggregation level passed through to the loader
            (e.g. "category", "task", "dataset").
        selected_cols: Optional subset of metric columns to average over;
            falsy means "use every metric column".
        source: "afrobench_lite" selects the lite results, anything else
            selects the full AfroBench results.
    """
    # Pick the result loader matching the requested benchmark suite.
    if source == "afrobench_lite":
        scores = create_result_dataframes_lite(afrobench_lite_path, level=view_type)
    else:
        scores = create_result_dataframes(afrobench_path, level=view_type)

    scores = scores.reset_index().rename(columns={"index": "Model"})

    metric_columns = [col for col in scores.columns if col != "Model"]
    if selected_cols:
        # Keep user order but drop anything not actually present.
        metric_columns = [col for col in selected_cols if col in metric_columns]

    scores["Score"] = scores[metric_columns].mean(axis=1).round(1)
    return plot_model_scores(scores.sort_values("Score", ascending=False))
def average_nested_scores(score_dict):
    """Collapse ``{model: {key: [scores]}}`` to ``{model: {key: mean}}``.

    Means are rounded to one decimal place.
    """
    averaged = {}
    for model, groups in score_dict.items():
        averaged[model] = {
            key: round(sum(values) / len(values), 1)
            for key, values in groups.items()
        }
    return averaged
def create_result_dataframes(json_file, level="category"):
    """Pivot AfroBench results JSON into a model-by-column DataFrame.

    The JSON is nested as category -> task -> {"datasets": {dataset:
    {model: score}}}. Task- and category-level scores are means over the
    contributing dataset scores (rounded to 1dp); dataset level is raw.

    Returns the DataFrame for *level*, or an error string when *level*
    is not one of "category", "task", "dataset" (original behavior kept).
    """
    with open(json_file, "r", encoding="utf-8") as f:
        results = json.load(f)

    by_task = {}
    by_dataset = {}
    by_category = {}
    for category, tasks in results.items():
        for task, content in tasks.items():
            for dataset, model_scores in content["datasets"].items():
                for model, score in model_scores.items():
                    # Accumulate per-task and per-category lists for averaging;
                    # dataset scores are single values.
                    by_task.setdefault(model, {}).setdefault(task, []).append(score)
                    by_dataset.setdefault(model, {})[dataset] = score
                    by_category.setdefault(model, {}).setdefault(category, []).append(score)

    frames = {
        "task": pd.DataFrame(average_nested_scores(by_task)).T.sort_index(),
        "dataset": pd.DataFrame(by_dataset).T.sort_index(),
        "category": pd.DataFrame(average_nested_scores(by_category)).T.sort_index(),
    }
    return frames.get(level, "Invalid level. Choose from: ['category', 'task', 'dataset']")
def create_result_dataframes_lite(json_file, level="task"):
    """Pivot AfroBench-Lite results JSON into a model-by-column DataFrame.

    The JSON is nested as task -> dataset -> {model: score}. Task-level
    scores are means over that task's datasets (rounded to 1dp); dataset
    level is raw. ``level="language"`` loads a separate per-language file.

    Returns the DataFrame for *level*, or an error string for an unknown
    level (original behavior kept).
    """
    with open(json_file, "r", encoding="utf-8") as f:
        results = json.load(f)

    per_task = {}
    per_dataset = {}
    for task, datasets in results.items():
        for dataset, model_scores in datasets.items():
            for model, score in model_scores.items():
                per_dataset.setdefault(model, {})[dataset] = score
                per_task.setdefault(model, {}).setdefault(task, []).append(score)

    task_df = pd.DataFrame({
        model: {
            task: round(sum(vals) / len(vals), 1)
            for task, vals in groups.items()
        }
        for model, groups in per_task.items()
    }).T.sort_index()

    frames = {
        "task": task_df,
        "dataset": pd.DataFrame(per_dataset).T.sort_index(),
    }

    if level == "language":
        # Language-level scores live in their own precomputed JSON file.
        with open(lite_languages_path, "r", encoding="utf-8") as f:
            frames["language"] = pd.DataFrame(json.load(f)).T.sort_index()

    return frames.get(level, "Invalid level. Choose from: ['task', 'dataset']")