import os
import json
import pandas as pd
# Define task, subtask, and dataset mapping
TASK_MAPPING = {
"MasakhaPOS": ("NLU", "POS"),
"MasakhaNER": ("NLU", "NER"),
"AfriSenti": ("NLU", "Senti"),
"NollySenti": ("NLU", "Senti"),
"InjongoIntent": ("NLU", "Intent"),
"MasakhaNEWS": ("NLU", "Topic"),
"SIB": ("NLU", "Topic"),
"AfriHate": ("NLU", "Hate"),
"AfriXNLI": ("NLU", "NLI"),
"AfriQA": ("QA", "XQA"),
"Belebele": ("QA", "RC"),
"NaijaRC": ("QA", "RC"),
"UHURA": ("Knowledge", "Arc-E"),
"OpenAIMMLU": ("Knowledge", "MMLU"),
"AfriMMLU": ("Knowledge", "MMLU"),
"AfriMGSM": ("Reasoning", "Math"),
"SALT - en_xx": ("NLG", "MT(en/fr-xx)"),
"SALT - xx_en": ("NLG", "MT(xx-en/fr)"),
"Flores - en_xx": ("NLG", "MT(en/fr-xx)"),
"Flores - xx_en": ("NLG", "MT(xx-en/fr)"),
"MAFAND - en_xx": ("NLG", "MT(en/fr-xx)"),
"MAFAND - xx_en": ("NLG", "MT(xx-en/fr)"),
"NTREX - en_xx": ("NLG", "MT(en/fr-xx)"),
"NTREX - xx_en": ("NLG", "MT(xx-en/fr)"),
"XLSUM": ("NLG", "SUMM"),
"ADR": ("NLG", "ADR"),
"RC": ("QA", "RC"),
"Sentiment": ("NLU", "Senti"),
"TC": ("NLU", "Topic"),
"MMLU": ("Knowledge", "MMLU"),
"MT - xx-en": ("NLG", "MT(xx-en/fr)"),
"MT - en-xx": ("NLG", "MT(en/fr-xx)"),
}
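# Map raw model names found in the results CSVs to canonical display names.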
MODEL_MAP = {
"AfroLlama-V1": "AfroLLaMa 8B",
"LLaMAX3-8B-Alpaca": "LLaMAX3 8B",
"Llama-2-7b-chat-hf": "LLaMa2 7b",
"Llama-3.1-70B-Instruct": "LLaMa3.1 70B",
"Llama-3.1-8B-Instruct": "LLaMa3.1 8B",
"Meta-Llama-3-8B-Instruct": "LLaMa3 8B",
"aya-101": "Aya-101 13B",
"gemma-1.1-7b-it": "Gemma1.1 7b",
"gemma-2-27b-it": "Gemma2 27b",
"gemma-2-9b-it": "Gemma2 9b",
"gemini-1.5-pro-002": "Gemini 1.5 pro",
"gpt-4o-2024-08-06": "GPT-4o (Aug)",
"Gemma 2 IT 27B": "Gemma2 27b",
"Gemma 2 IT 9B": "Gemma2 9b",
"Aya-101": "Aya-101 13B",
"Meta-Llama-3.1-70B-Instruct": "LLaMa3.1 70B",
"LLaMAX3-8B": "LLaMAX3 8B",
"LLaMaX 3 8B": "LLaMAX3 8B",
"Meta-Llama-3-70B-Instruct": "LLaMa3.1 70B"
}
BASE_DIR = os.path.dirname(os.path.abspath(__file__)) # this points to /data
results_dir = os.path.join(BASE_DIR, "results")
community_results = os.path.join(BASE_DIR, "community_results")
output_direc = os.path.join(BASE_DIR, "leaderboard_json")
def generate_json_files(results=results_dir, community_result=community_results,
output_dir=output_direc, leaderboard=None):
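    """Aggregate per-dataset result CSVs into JSON files for the leaderboard.

    With leaderboard=None, one JSON file is written per task; with
    leaderboard="afrobench" or "afrobench_lite", a single aggregated
    leaderboard JSON is written instead.
    """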
os.makedirs(output_dir, exist_ok=True)
    # Containers for per-task JSON data, leaderboard scores, and per-language lite scores
    task_data = {}
    leaderboard_data = {}
    lite_language_scores = {}  # only populated when leaderboard="afrobench_lite"
task_map = {key.lower(): value for key, value in TASK_MAPPING.items()}
afrobench_tasks = ["MasakhaPOS", "MasakhaNER", "Sentiment", "TC", "InjongoIntent", "AfriHate", "AfriXNLI",
"AfriQA", "UHURA", "RC", "MMLU", "AfriMGSM", "MT - en-xx", "MT - xx-en", "XLSUM", "ADR"]
afrobench_tasks = [task.lower() for task in afrobench_tasks]
afrobench_lite_datasets = ["injongointent", "sib", "afrixnli", "belebele", "afrimmlu", "afrimgsm",
"flores - en_xx"]
afrobench_lite_languages = ["amh", "hau", "ibo", "kin", "lin", "lug", "orm", "sna", "sot", "swa", "xho", "yor", "zul", "wol"]
# Process each CSV file
for filename in os.listdir(results):
        if filename.endswith(".csv") and '- June2025.csv' not in filename:  # the June2025 community file is merged separately below
file_path = os.path.join(results, filename)
dataset_name = filename.replace(" - 0-shot.csv", "").replace(" 0-shot.csv", "")
# Identify task & subtask
task_info = task_map.get(dataset_name.lower())
if not task_info:
print(f"Skipping unmapped dataset: {dataset_name.lower()}")
continue
task, subtask = task_info
# Read CSV
df = pd.read_csv(file_path)
drop_col = [i for i in df.columns if 'unnamed' in i.lower()]
df.drop(drop_col, axis=1, inplace=True)
# Standardize model names
df.loc[df["model"].str.contains("LLaMaX", case=False), "model"] = "LLaMaX 3 8B"
df = df[df["model"] != "InkubaLM-0.4B"].copy()
df = df[df["model"] != "Claude 3.5 Sonnet"].copy()
df.loc[df["model"].str.contains("gpt", case=False), "model"] = "gpt-4o-2024-08-06"
df.loc[df["model"].str.contains("gemini", case=False), "model"] = "gemini-1.5-pro-002"
df["model"] = df["model"].map(MODEL_MAP)
# Extract models
models = df["model"].unique()
all_columns = list(df.columns)
meta_columns = ["model", "prompt", "avg_score", "avg"]
language_columns = [col for col in all_columns if col not in meta_columns]
            language_columns = [col for col in language_columns
                                if col.lower() not in {"eng", "fra", "eng_latn", "fra_latn", "en", "fr"}]
            avg_col = "avg" if "avg" in df.columns else "avg_score"  # the average-score column name varies by CSV
if leaderboard == "afrobench":
# Initialize leaderboard structure
if dataset_name.lower() not in afrobench_tasks:
continue
if task not in leaderboard_data:
leaderboard_data[task] = {}
if subtask not in leaderboard_data[task]:
leaderboard_data[task][subtask] = {"datasets": {}}
# Store per-model dataset scores
dataset_scores = {}
                for model in models:
                    model_rows = df[df["model"] == model]
                    # Keep the prompt variant with the highest average score
                    best_avg_row = model_rows.loc[model_rows[avg_col].idxmax()]
                    scores = [best_avg_row[col] for col in language_columns if col in best_avg_row]
                    dataset_scores[model] = round(sum(scores) / len(scores), 1) if scores else None
leaderboard_data[task][subtask]["datasets"][dataset_name] = dataset_scores
elif leaderboard == "afrobench_lite":
if dataset_name in afrobench_lite_datasets:
if subtask not in leaderboard_data:
leaderboard_data[subtask] = {}
# Store per-model dataset scores
dataset_scores = {}
                    df.fillna(0, inplace=True)  # treat missing language scores as 0
                    for model in models:
                        model_rows = df[df["model"] == model]
                        # Keep the prompt variant with the highest average score
                        best_avg_row = model_rows.loc[model_rows[avg_col].idxmax()]
                        # Match lite language codes against column prefixes (e.g. "swa" matches "swa_latn")
                        scores = [best_avg_row[next(c for c in best_avg_row.index if c.split('_')[0] == lang)]
                                  for lang in afrobench_lite_languages
                                  if any(c.split('_')[0] == lang for c in best_avg_row.index)]
                        dataset_scores[model] = round(sum(scores) / len(scores), 1) if scores else None
if model not in lite_language_scores:
lite_language_scores[model] = {}
for lang in afrobench_lite_languages:
if lang in df.columns:
val = df.loc[df["model"] == model, lang].values
if val.size > 0:
lite_language_scores[model].setdefault(lang, []).append(val[0])
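                    # Merge in community-submitted results (June 2025) for this dataset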
df = pd.read_csv(os.path.join(community_result, "New Results - June2025.csv"))
df = df[df['task'] == dataset_name]
df.fillna(0, inplace=True)
models = df["model"].unique()
                    for model in models:
                        scores = [df.loc[df["model"] == model, col].values[0]
                                  for col in afrobench_lite_languages if col in df.columns]
                        dataset_scores[model] = round(sum(scores) / len(scores), 1) if scores else None
if model not in lite_language_scores:
lite_language_scores[model] = {}
for lang in afrobench_lite_languages:
if lang in df.columns:
val = df.loc[df["model"] == model, lang].values
if val.size > 0:
lite_language_scores[model].setdefault(lang, []).append(val[0])
leaderboard_data[subtask][dataset_name] = dataset_scores
else:
# Initialize task & subtask structure
if task not in task_data:
task_data[task] = {"task": task, "subtasks": {}}
if subtask not in task_data[task]["subtasks"]:
task_data[task]["subtasks"][subtask] = {"datasets": {}}
# Store per-task dataset data
task_data[task]["subtasks"][subtask]["datasets"][dataset_name] = {
"languages": language_columns,
"scores": {}
}
                for model in models:
                    model_rows = df[df["model"] == model]
                    # Keep the prompt variant with the highest average score
                    best_avg_row = model_rows.loc[model_rows[avg_col].idxmax()]
                    model_scores = [round(score, 1) for score in best_avg_row[language_columns].to_list()]
task_data[task]["subtasks"][subtask]["datasets"][dataset_name]["scores"][model] = model_scores
# Save leaderboard JSON if enabled
if leaderboard:
output_path = os.path.join(output_dir, f"{leaderboard}.json")
with open(output_path, "w", encoding="utf-8") as json_file:
json.dump(leaderboard_data, json_file, indent=4)
print("Leaderboard JSON generated successfully!")
if leaderboard == "afrobench_lite":
lang_output = os.path.join(output_dir, "lite_language_scores.json")
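            # Average each language's scores across all lite datasets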
averaged_scores = {
model: {
lang: round(sum(scores) / len(scores), 1)
for lang, scores in langs.items()
}
for model, langs in lite_language_scores.items()
}
with open(lang_output, "w", encoding="utf-8") as f:
json.dump(averaged_scores, f, indent=4)
print("Saved language version for lite")
# Save per-task JSON files if leaderboard=False
else:
for task, data in task_data.items():
output_path = os.path.join(output_dir, f"{task.lower().replace(' ', '_')}.json")
with open(output_path, "w", encoding="utf-8") as json_file:
json.dump(data, json_file, indent=4)
print("Task-wise JSON files with subtasks generated successfully!")
if __name__ == "__main__":
    generate_json_files(leaderboard="afrobench_lite")