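"""Build leaderboard JSON files from AfroBench result CSVs.

Aggregates per-dataset score CSVs (plus community-submitted results) into the
JSON files consumed by the leaderboard app.
"""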
import os
import json

import pandas as pd
# Define task, subtask, and dataset mapping
TASK_MAPPING = {
    "MasakhaPOS": ("NLU", "POS"),
    "MasakhaNER": ("NLU", "NER"),
    "AfriSenti": ("NLU", "Senti"),
    "NollySenti": ("NLU", "Senti"),
    "InjongoIntent": ("NLU", "Intent"),
    "MasakhaNEWS": ("NLU", "Topic"),
    "SIB": ("NLU", "Topic"),
    "AfriHate": ("NLU", "Hate"),
    "AfriXNLI": ("NLU", "NLI"),
    "AfriQA": ("QA", "XQA"),
    "Belebele": ("QA", "RC"),
    "NaijaRC": ("QA", "RC"),
    "UHURA": ("Knowledge", "Arc-E"),
    "OpenAIMMLU": ("Knowledge", "MMLU"),
    "AfriMMLU": ("Knowledge", "MMLU"),
    "AfriMGSM": ("Reasoning", "Math"),
    "SALT - en_xx": ("NLG", "MT(en/fr-xx)"),
    "SALT - xx_en": ("NLG", "MT(xx-en/fr)"),
    "Flores - en_xx": ("NLG", "MT(en/fr-xx)"),
    "Flores - xx_en": ("NLG", "MT(xx-en/fr)"),
    "MAFAND - en_xx": ("NLG", "MT(en/fr-xx)"),
    "MAFAND - xx_en": ("NLG", "MT(xx-en/fr)"),
    "NTREX - en_xx": ("NLG", "MT(en/fr-xx)"),
    "NTREX - xx_en": ("NLG", "MT(xx-en/fr)"),
    "XLSUM": ("NLG", "SUMM"),
    "ADR": ("NLG", "ADR"),
    "RC": ("QA", "RC"),
    "Sentiment": ("NLU", "Senti"),
    "TC": ("NLU", "Topic"),
    "MMLU": ("Knowledge", "MMLU"),
    "MT - xx-en": ("NLG", "MT(xx-en/fr)"),
    "MT - en-xx": ("NLG", "MT(en/fr-xx)"),
}
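# Map raw model identifiers, as they appear in the result CSVs, to canonical display names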
MODEL_MAP = {
    "AfroLlama-V1": "AfroLLaMa 8B",
    "LLaMAX3-8B-Alpaca": "LLaMAX3 8B",
    "Llama-2-7b-chat-hf": "LLaMa2 7b",
    "Llama-3.1-70B-Instruct": "LLaMa3.1 70B",
    "Llama-3.1-8B-Instruct": "LLaMa3.1 8B",
    "Meta-Llama-3-8B-Instruct": "LLaMa3 8B",
    "aya-101": "Aya-101 13B",
    "gemma-1.1-7b-it": "Gemma1.1 7b",
    "gemma-2-27b-it": "Gemma2 27b",
    "gemma-2-9b-it": "Gemma2 9b",
    "gemini-1.5-pro-002": "Gemini 1.5 pro",
    "gpt-4o-2024-08-06": "GPT-4o (Aug)",
    "Gemma 2 IT 27B": "Gemma2 27b",
    "Gemma 2 IT 9B": "Gemma2 9b",
    "Aya-101": "Aya-101 13B",
    "Meta-Llama-3.1-70B-Instruct": "LLaMa3.1 70B",
    "LLaMAX3-8B": "LLaMAX3 8B",
    "LLaMaX 3 8B": "LLaMAX3 8B",
    "Meta-Llama-3-70B-Instruct": "LLaMa3.1 70B",
}
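# Paths are resolved relative to this file, so the script works from any working directory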
BASE_DIR = os.path.dirname(os.path.abspath(__file__))  # this points to /data
results_dir = os.path.join(BASE_DIR, "results")
community_results = os.path.join(BASE_DIR, "community_results")
output_direc = os.path.join(BASE_DIR, "leaderboard_json")
def generate_json_files(results=results_dir, community_result=community_results,
                        output_dir=output_direc, leaderboard=None):
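    """Aggregate score CSVs from ``results`` into JSON files under ``output_dir``.

    leaderboard=None              -> one JSON file per task, with per-language scores
    leaderboard="afrobench"       -> single afrobench.json task/subtask summary
    leaderboard="afrobench_lite"  -> afrobench_lite.json plus per-language averages
    """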
    os.makedirs(output_dir, exist_ok=True)
    # Dictionaries to store either per-task JSON data or the leaderboard
    task_data = {}
    leaderboard_data = {}
    # Per-model, per-language score lists for the lite leaderboard (averaged at the end)
    lite_language_scores = {}
    task_map = {key.lower(): value for key, value in TASK_MAPPING.items()}
    afrobench_tasks = ["MasakhaPOS", "MasakhaNER", "Sentiment", "TC", "InjongoIntent", "AfriHate", "AfriXNLI",
                       "AfriQA", "UHURA", "RC", "MMLU", "AfriMGSM", "MT - en-xx", "MT - xx-en", "XLSUM", "ADR"]
    afrobench_tasks = [task.lower() for task in afrobench_tasks]
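    # AfroBench-Lite: a fixed subset of datasets scored on 14 focus languages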
    afrobench_lite_datasets = ["injongointent", "sib", "afrixnli", "belebele", "afrimmlu", "afrimgsm",
                               "flores - en_xx"]
    afrobench_lite_languages = ["amh", "hau", "ibo", "kin", "lin", "lug", "orm", "sna", "sot", "swa",
                                "xho", "yor", "zul", "wol"]
    # Process each CSV file
    for filename in os.listdir(results):
        if filename.endswith(".csv") and '- June2025.csv' not in filename:
            file_path = os.path.join(results, filename)
            dataset_name = filename.replace(" - 0-shot.csv", "").replace(" 0-shot.csv", "")
            # Identify task & subtask
            task_info = task_map.get(dataset_name.lower())
            if not task_info:
                print(f"Skipping unmapped dataset: {dataset_name.lower()}")
                continue
            task, subtask = task_info
            # Read CSV and drop stray index columns ("Unnamed: 0", ...)
            df = pd.read_csv(file_path)
            drop_col = [col for col in df.columns if 'unnamed' in col.lower()]
            df.drop(drop_col, axis=1, inplace=True)
            # Standardize model names before mapping to display names
            df.loc[df["model"].str.contains("LLaMaX", case=False, na=False), "model"] = "LLaMaX 3 8B"
            df = df[df["model"] != "InkubaLM-0.4B"].copy()
            df = df[df["model"] != "Claude 3.5 Sonnet"].copy()
            df.loc[df["model"].str.contains("gpt", case=False, na=False), "model"] = "gpt-4o-2024-08-06"
            df.loc[df["model"].str.contains("gemini", case=False, na=False), "model"] = "gemini-1.5-pro-002"
            df["model"] = df["model"].map(MODEL_MAP)
            # Drop rows whose model has no canonical name; NaN models would break the per-model lookups below
            df = df[df["model"].notna()].copy()
            # Extract models and language columns
            models = df["model"].unique()
            all_columns = list(df.columns)
            meta_columns = ["model", "prompt", "avg_score", "avg"]
            language_columns = [col for col in all_columns if col not in meta_columns]
            # Exclude the pivot languages (English/French) from the per-language average
            language_columns = [col for col in language_columns
                                if col.lower() not in {"eng", "fra", "eng_latn", "fra_latn", "en", "fr"}]
            avg_col = "avg" if "avg" in df.columns else "avg_score"
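            # Three output modes: "afrobench" summary, "afrobench_lite" subset, or per-task JSON (default)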
            if leaderboard == "afrobench":
                # Initialize leaderboard structure
                if dataset_name.lower() not in afrobench_tasks:
                    continue
                if task not in leaderboard_data:
                    leaderboard_data[task] = {}
                if subtask not in leaderboard_data[task]:
                    leaderboard_data[task][subtask] = {"datasets": {}}
                # Store per-model dataset scores, keeping each model's best-prompt row
                dataset_scores = {}
                for model in models:
                    model_rows = df[df["model"] == model]
                    best_avg_row = model_rows.loc[model_rows[avg_col].idxmax()]
                    scores = [best_avg_row[col] for col in language_columns if col in best_avg_row]
                    dataset_scores[model] = round(sum(scores) / len(scores), 1) if scores else None
                leaderboard_data[task][subtask]["datasets"][dataset_name] = dataset_scores
            elif leaderboard == "afrobench_lite":
                if dataset_name in afrobench_lite_datasets:
                    if subtask not in leaderboard_data:
                        leaderboard_data[subtask] = {}
                    # Store per-model dataset scores, keeping each model's best-prompt row
                    dataset_scores = {}
                    df.fillna(0, inplace=True)  # zero-fill missing scores once, before iterating models
                    for model in models:
                        model_rows = df[df["model"] == model]
                        best_avg_row = model_rows.loc[model_rows[avg_col].idxmax()]
                        # Columns may carry a script suffix (e.g. "hau_latn"): match on the prefix before "_"
                        scores = []
                        for lang in afrobench_lite_languages:
                            matching = [c for c in best_avg_row.index if c.split('_')[0] == lang]
                            if matching:
                                scores.append(best_avg_row[matching[0]])
                        # Avoid division by zero when none of the lite languages are present
                        dataset_scores[model] = round(sum(scores) / len(scores), 1) if scores else None
                        if model not in lite_language_scores:
                            lite_language_scores[model] = {}
                        for lang in afrobench_lite_languages:
                            if lang in df.columns:
                                val = df.loc[df["model"] == model, lang].values
                                if val.size > 0:
                                    lite_language_scores[model].setdefault(lang, []).append(val[0])
                    # Fold in community-submitted results for the same dataset
                    df = pd.read_csv(os.path.join(community_result, "New Results - June2025.csv"))
                    df = df[df['task'] == dataset_name]
                    df.fillna(0, inplace=True)
                    models = df["model"].unique()
                    for model in models:
                        scores = [df.loc[df["model"] == model, col].values[0] for col in afrobench_lite_languages
                                  if col in df.columns]
                        dataset_scores[model] = round(sum(scores) / len(scores), 1) if scores else None
                        if model not in lite_language_scores:
                            lite_language_scores[model] = {}
                        for lang in afrobench_lite_languages:
                            if lang in df.columns:
                                val = df.loc[df["model"] == model, lang].values
                                if val.size > 0:
                                    lite_language_scores[model].setdefault(lang, []).append(val[0])
                    leaderboard_data[subtask][dataset_name] = dataset_scores
            else:
                # Initialize task & subtask structure
                if task not in task_data:
                    task_data[task] = {"task": task, "subtasks": {}}
                if subtask not in task_data[task]["subtasks"]:
                    task_data[task]["subtasks"][subtask] = {"datasets": {}}
                # Store per-task dataset data
                task_data[task]["subtasks"][subtask]["datasets"][dataset_name] = {
                    "languages": language_columns,
                    "scores": {}
                }
                for model in models:
                    model_rows = df[df["model"] == model]
                    best_avg_row = model_rows.loc[model_rows[avg_col].idxmax()]
                    model_scores = [round(score, 1) for score in best_avg_row[language_columns].to_list()]
                    task_data[task]["subtasks"][subtask]["datasets"][dataset_name]["scores"][model] = model_scores
    # Save leaderboard JSON if enabled
    if leaderboard:
        output_path = os.path.join(output_dir, f"{leaderboard}.json")
        with open(output_path, "w", encoding="utf-8") as json_file:
            json.dump(leaderboard_data, json_file, indent=4)
        print("Leaderboard JSON generated successfully!")
        if leaderboard == "afrobench_lite":
            lang_output = os.path.join(output_dir, "lite_language_scores.json")
            # Average each language's scores across the lite datasets
            averaged_scores = {
                model: {
                    lang: round(sum(scores) / len(scores), 1)
                    for lang, scores in langs.items()
                }
                for model, langs in lite_language_scores.items()
            }
            with open(lang_output, "w", encoding="utf-8") as f:
                json.dump(averaged_scores, f, indent=4)
            print("Saved language version for lite")
    # Save per-task JSON files when no leaderboard is requested
    else:
        for task, data in task_data.items():
            output_path = os.path.join(output_dir, f"{task.lower().replace(' ', '_')}.json")
            with open(output_path, "w", encoding="utf-8") as json_file:
                json.dump(data, json_file, indent=4)
        print("Task-wise JSON files with subtasks generated successfully!")
if __name__ == "__main__":
    # Other supported modes: generate_json_files() for per-task files,
    # generate_json_files(leaderboard="afrobench") for the full summary.
    generate_json_files(leaderboard="afrobench_lite")
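# Shape of a per-task JSON file, inferred from the code above (sketch; names and
# values below are illustrative, not actual results):
# {
#   "task": "NLU",
#   "subtasks": {
#     "Senti": {
#       "datasets": {
#         "AfriSenti": {"languages": ["amh", ...], "scores": {"Gemma2 9b": [63.1, ...]}}
#       }
#     }
#   }
# }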