# source: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/blob/main/src/utils_display.py
import json
import hashlib
import pandas as pd
import matplotlib.pyplot as plt
from dataclasses import dataclass
import plotly.graph_objects as go
from transformers import AutoConfig
from src.config import afrobench_path, afrobench_lite_path, lite_languages_path
# These classes are for user-facing column names, to avoid having to change them
# all around the code when a modification is needed
@dataclass
class ColumnContent:
name: str
type: str
displayed_by_default: bool
hidden: bool = False
def fields(raw_class):
return [
v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"
]
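# Hedged usage sketch: `fields` is intended for a plain container class whose
# class attributes are ColumnContent entries. The class below is hypothetical,
# for illustration only:
#
#   class ExampleColumns:
#       model = ColumnContent("Model", "markdown", True)
#       score = ColumnContent("Score", "number", True)
#
#   [c.name for c in fields(ExampleColumns)]  # -> ["Model", "Score"]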
def model_hyperlink(link, model_name):
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
def styled_error(error):
return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
def styled_warning(warn):
return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
def styled_message(message):
return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
def has_no_nan_values(df, columns):
return df[columns].notna().all(axis=1)
def has_nan_values(df, columns):
return df[columns].isna().any(axis=1)
def is_model_on_hub(model_name: str, revision: str) -> tuple[bool, str | None]:
    try:
        AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=False)
        return True, None
    except ValueError:
        return (
            False,
            "needs to be launched with `trust_remote_code=True`. For safety reasons, we do not allow these models to be automatically submitted to the leaderboard.",
        )
    except Exception as e:
        print(f"Could not get the model config from the hub: {e}")
        return False, "was not found on hub!"
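# Hedged usage sketch (the model id is illustrative): callers unpack the
# (found, error_message) pair and surface the message on failure:
#
#   on_hub, error = is_model_on_hub("bert-base-uncased", revision="main")
#   if not on_hub:
#       print(styled_error(f"Model {error}"))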
def get_color(name):
    # Map the model name to one of 20 tab20 colors. Use a hashlib digest rather
    # than the built-in hash(), which is salted per process and would assign
    # different colors between sessions.
    digest = int(hashlib.md5(name.encode("utf-8")).hexdigest(), 16)
    color = plt.cm.tab20(digest % 20)  # 20 unique colors
    return f"rgb({int(color[0]*255)}, {int(color[1]*255)}, {int(color[2]*255)})"
def plot_model_scores(df):
df = df.copy()
df["Color"] = df["Model"].apply(get_color)
# Extract model size as string ("8B", "13B", or "UNK")
def extract_size_str(model):
parts = model.split()
for part in parts:
if part.endswith("B") and part[:-1].isdigit():
return part
return "UNK"
    # Numeric size value, used only to order the x-axis categories;
    # "UNK" maps to 100 so unknown sizes sort after the labelled ones.
    def size_to_num(size_str):
        return int(size_str[:-1]) if size_str != "UNK" else 100
df["Size"] = df["Model"].apply(extract_size_str)
df["Size Num"] = df["Size"].apply(size_to_num)
size_order = df.drop_duplicates("Size").sort_values("Size Num")["Size"].tolist()
fig = go.Figure()
for _, row in df.iterrows():
fig.add_trace(
go.Scatter(
x=[row["Size"]],
y=[row["Score"]],
mode="markers",
name=row["Model"],
marker=dict(
size=14,
color=row["Color"],
line=dict(width=1, color="black"),
),
hovertemplate=f"<b>{row['Model']}</b><br>Score: {row['Score']}<br>Size: {row['Size']}",
showlegend=True,
)
)
fig.update_layout(
title="πŸ“Š Model Score vs Size",
xaxis=dict(
title="Model Size",
type="category",
categoryorder="array",
categoryarray=size_order
),
yaxis_title="Average Score",
height=600,
margin=dict(l=60, r=60, t=40, b=40),
legend=dict(title="Model", orientation="v", x=1.05, y=1),
)
return fig
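# Hedged usage sketch: plot_model_scores expects "Model" and "Score" columns;
# the size bucket is parsed out of the model name itself. The names and scores
# below are illustrative:
#
#   demo = pd.DataFrame({
#       "Model": ["LLaMa 3 8B", "Gemma 2 27B", "GPT-4o"],
#       "Score": [61.2, 64.8, 72.5],
#   })
#   fig = plot_model_scores(demo)  # "GPT-4o" has no "<n>B" token -> "UNK"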
def plot_leaderboard_scores(view_type, selected_cols, source):
# Load leaderboard data
if source == "afrobench_lite":
df = create_result_dataframes_lite(afrobench_lite_path, level=view_type)
else:
df = create_result_dataframes(afrobench_path, level=view_type)
df.reset_index(inplace=True)
df.rename(columns={"index": "Model"}, inplace=True)
metric_cols = [c for c in df.columns if c not in ["Model"]]
if selected_cols:
metric_cols = [c for c in selected_cols if c in metric_cols]
df["Score"] = df[metric_cols].mean(axis=1).round(1)
df_sorted = df.sort_values("Score", ascending=False)
fig = plot_model_scores(df_sorted)
return fig
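# Hedged usage sketch: `view_type` must be a level the loaders below accept
# ("category", "task", or "dataset" for AfroBench; "task", "dataset", or
# "language" for the lite set):
#
#   fig = plot_leaderboard_scores("task", selected_cols=None, source="afrobench_lite")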
def average_nested_scores(score_dict):
return {
model: {k: round(sum(v) / len(v), 1) for k, v in group.items()}
for model, group in score_dict.items()
}
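# Example: per-group score lists collapse to one-decimal means, e.g.
#
#   average_nested_scores({"gpt-4o": {"QA": [70.0, 80.0]}})
#   # -> {"gpt-4o": {"QA": 75.0}}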
def create_result_dataframes(json_file, level="category"):
with open(json_file, "r", encoding="utf-8") as f:
data = json.load(f)
task_scores = {}
dataset_scores = {}
category_scores = {}
for category, subtasks in data.items():
for task, content in subtasks.items():
for dataset, scores in content["datasets"].items():
for model, score in scores.items():
# Task-level
task_scores.setdefault(model, {}).setdefault(task, []).append(score)
# Dataset-level
dataset_scores.setdefault(model, {})[dataset] = score
# Category-level
category_scores.setdefault(model, {}).setdefault(category, []).append(score)
task_df = pd.DataFrame(average_nested_scores(task_scores)).T.sort_index()
dataset_df = pd.DataFrame(dataset_scores).T.sort_index()
category_df = pd.DataFrame(average_nested_scores(category_scores)).T.sort_index()
    level_map = {
        "task": task_df,
        "dataset": dataset_df,
        "category": category_df,
    }
    if level not in level_map:
        raise ValueError("Invalid level. Choose from: ['category', 'task', 'dataset']")
    return level_map[level]
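# The JSON layout this loader expects, inferred from the loops above
# (category, task, dataset, and model names are illustrative):
#
#   {
#     "NLU": {
#       "Sentiment": {
#         "datasets": {
#           "AfriSenti": {"gpt-4o": 71.3, "gemma-2-27b": 65.0}
#         }
#       }
#     }
#   }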
def create_result_dataframes_lite(json_file, level="task"):
with open(json_file, "r", encoding="utf-8") as f:
data = json.load(f)
# Task-level: average across datasets in each task group
task_scores = {}
dataset_scores = {}
for task, datasets in data.items():
for dataset, scores in datasets.items():
for model, score in scores.items():
dataset_scores.setdefault(model, {})[dataset] = score
task_scores.setdefault(model, {}).setdefault(task, []).append(score)
task_level_df = pd.DataFrame({
model: {task: round(sum(scores) / len(scores), 1) for task, scores in task_dict.items()}
for model, task_dict in task_scores.items()
}).T.sort_index()
dataset_level_df = pd.DataFrame(dataset_scores).T.sort_index()
level_map = {
"task": task_level_df,
"dataset": dataset_level_df,
}
if level == "language":
with open(lite_languages_path, "r", encoding="utf-8") as f:
data = json.load(f)
language_level_df = pd.DataFrame(data).T.sort_index()
level_map["language"] = language_level_df
    if level not in level_map:
        raise ValueError("Invalid level. Choose from: ['task', 'dataset', 'language']")
    return level_map[level]
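# The lite results file drops the category level, so the layout this loader
# expects, inferred from the loops above, is (names illustrative):
#
#   {
#     "Sentiment": {
#       "AfriSenti": {"gpt-4o": 71.3, "gemma-2-27b": 65.0}
#     }
#   }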