# AfroBench / data / format_data.py
# Uploaded via huggingface_hub (#1) by JessicaOjo — commit a147f3f (verified)
import os
import json
import glob
import pandas as pd
# Define task, subtask, and dataset mapping.
# Each key is a dataset name as derived from a results-CSV filename; the value
# is the (task, subtask) pair under which its scores are grouped in the output
# JSON. Lookups are case-insensitive (generate_json_files lowercases the keys).
TASK_MAPPING = {
    "MasakhaPOS": ("NLU", "POS"),
    "MasakhaNER": ("NLU", "NER"),
    "AfriSenti": ("NLU", "Senti"),
    "NollySenti": ("NLU", "Senti"),
    "InjongoIntent": ("NLU", "Intent"),
    "MasakhaNEWS": ("NLU", "Topic"),
    "SIB": ("NLU", "Topic"),
    "AfriHate": ("NLU", "Hate"),
    "AfriXNLI": ("NLU", "NLI"),
    "AfriQA": ("QA", "XQA"),
    "Belebele": ("QA", "RC"),
    "NaijaRC": ("QA", "RC"),
    "UHURA": ("Knowledge", "Arc-E"),
    "OpenAIMMLU": ("Knowledge", "MMLU"),
    "AfriMMLU": ("Knowledge", "MMLU"),
    "AfriMGSM": ("Reasoning", "Math"),
    # Machine-translation datasets carry a direction suffix (en_xx / xx_en).
    "SALT - en_xx": ("NLG", "MT(en/fr-xx)"),
    "SALT - xx_en": ("NLG", "MT(xx-en/fr)"),
    "Flores - en_xx": ("NLG", "MT(en/fr-xx)"),
    "Flores - xx_en": ("NLG", "MT(xx-en/fr)"),
    "MAFAND - en_xx": ("NLG", "MT(en/fr-xx)"),
    "MAFAND - xx_en": ("NLG", "MT(xx-en/fr)"),
    "NTREX - en_xx": ("NLG", "MT(en/fr-xx)"),
    "NTREX - xx_en": ("NLG", "MT(xx-en/fr)"),
    "XLSUM": ("NLG", "SUMM"),
    "ADR": ("NLG", "ADR"),
    # Aggregate/umbrella names used by the "afrobench" leaderboard task list.
    "RC": ("QA", "RC"),
    "Sentiment": ("NLU", "Senti"),
    "TC": ("NLU", "Topic"),
    "MMLU": ("Knowledge", "MMLU"),
    "MT - xx-en": ("NLG", "MT(xx-en/fr)"),
    "MT - en-xx": ("NLG", "MT(en/fr-xx)"),
}
# Canonical display names for models. The result CSVs contain several raw
# spellings per model; every spelling is collapsed to a single leaderboard
# label here. Models missing from this map become NaN after Series.map.
MODEL_MAP = {
    "AfroLlama-V1": "AfroLLaMa 8B",
    "LLaMAX3-8B-Alpaca": "LLaMAX3 8B",
    "Llama-2-7b-chat-hf": "LLaMa2 7b",
    "Llama-3.1-70B-Instruct": "LLaMa3.1 70B",
    "Llama-3.1-8B-Instruct": "LLaMa3.1 8B",
    "Meta-Llama-3-8B-Instruct": "LLaMa3 8B",
    "aya-101": "Aya-101 13B",
    "gemma-1.1-7b-it": "Gemma1.1 7b",
    "gemma-2-27b-it": "Gemma2 27b",
    "gemma-2-9b-it": "Gemma2 9b",
    "gemini-1.5-pro-002": "Gemini 1.5 pro",
    "gpt-4o-2024-08-06": "GPT-4o (Aug)",
    # Alternative spellings found in community / older result files.
    "Gemma 2 IT 27B": "Gemma2 27b",
    "Gemma 2 IT 9B": "Gemma2 9b",
    "Aya-101": "Aya-101 13B",
    "Meta-Llama-3.1-70B-Instruct": "LLaMa3.1 70B",
    "LLaMAX3-8B": "LLaMAX3 8B",
    "LLaMaX 3 8B": "LLaMAX3 8B",
    # NOTE(review): this maps the Llama-3 70B checkpoint onto the 3.1 label —
    # presumably an intentional consolidation; confirm with the data owners.
    "Meta-Llama-3-70B-Instruct": "LLaMa3.1 70B"
}
# Filesystem layout: all paths are resolved relative to this file so the
# script behaves the same regardless of the current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))  # this points to /data
results_dir = os.path.join(BASE_DIR, "results")  # per-dataset result CSVs
community_results = os.path.join(BASE_DIR, "community_results")  # community submissions
output_direc = os.path.join(BASE_DIR, "leaderboard_json")  # generated JSON output
def generate_json_files(results=results_dir, community_result=community_results,
                        output_dir=output_direc, leaderboard=None):
    """Aggregate per-dataset result CSVs into leaderboard / per-task JSON files.

    Args:
        results: directory of per-dataset CSVs named "<dataset> - 0-shot.csv"
            (or "<dataset> 0-shot.csv"). Files containing "- June2025.csv"
            are skipped here and handled via ``community_result``.
        community_result: directory holding community-submitted results; only
            "New Results - June2025.csv" is read (afrobench_lite mode only).
        output_dir: destination directory for JSON files (created if missing).
        leaderboard: selects the output mode —
            None             -> one JSON per task with raw per-language scores;
            "afrobench"      -> a single afrobench.json leaderboard;
            "afrobench_lite" -> afrobench_lite.json plus
                                lite_language_scores.json (per-language means).

    Side effects: writes JSON files into ``output_dir`` and prints progress.
    """
    os.makedirs(output_dir, exist_ok=True)

    task_data = {}         # task -> {"task": ..., "subtasks": ...}  (leaderboard=None)
    leaderboard_data = {}  # nested leaderboard payload              (leaderboard set)
    # model -> language -> [scores]; filled only in afrobench_lite mode.
    # Initialized up front (previously guarded by a `locals()` check) so the
    # final dump cannot hit a NameError when no lite dataset was matched.
    lite_language_scores = {}

    # Case-insensitive lookup of (task, subtask) by dataset name.
    task_map = {key.lower(): value for key, value in TASK_MAPPING.items()}

    afrobench_tasks = ["MasakhaPOS", "MasakhaNER", "Sentiment", "TC", "InjongoIntent", "AfriHate", "AfriXNLI",
                       "AfriQA", "UHURA", "RC", "MMLU", "AfriMGSM", "MT - en-xx", "MT - xx-en", "XLSUM", "ADR"]
    afrobench_tasks = [task.lower() for task in afrobench_tasks]
    afrobench_lite_datasets = ["injongointent", "sib", "afrixnli", "belebele", "afrimmlu", "afrimgsm",
                               "flores - en_xx"]
    afrobench_lite_languages = ["amh", "hau", "ibo", "kin", "lin", "lug", "orm", "sna", "sot", "swa",
                                "xho", "yor", "zul", "wol"]

    # Process each per-dataset CSV file.
    for filename in os.listdir(results):
        if not filename.endswith(".csv") or '- June2025.csv' in filename:
            continue
        file_path = os.path.join(results, filename)
        dataset_name = filename.replace(" - 0-shot.csv", "").replace(" 0-shot.csv", "")

        # Identify task & subtask.
        task_info = task_map.get(dataset_name.lower())
        if not task_info:
            print(f"Skipping unmapped dataset: {dataset_name.lower()}")
            continue
        task, subtask = task_info

        # Read CSV and drop pandas index artifacts ("Unnamed: N" columns).
        df = pd.read_csv(file_path)
        drop_col = [i for i in df.columns if 'unnamed' in i.lower()]
        df.drop(drop_col, axis=1, inplace=True)

        # Standardize raw model names, then map them to display names.
        df.loc[df["model"].str.contains("LLaMaX", case=False), "model"] = "LLaMaX 3 8B"
        df = df[df["model"] != "InkubaLM-0.4B"].copy()
        df = df[df["model"] != "Claude 3.5 Sonnet"].copy()
        df.loc[df["model"].str.contains("gpt", case=False), "model"] = "gpt-4o-2024-08-06"
        df.loc[df["model"].str.contains("gemini", case=False), "model"] = "gemini-1.5-pro-002"
        df["model"] = df["model"].map(MODEL_MAP)

        # Unmapped models become NaN after .map(); drop them instead of
        # letting idxmax() crash on an empty selection further down.
        models = [m for m in df["model"].unique() if pd.notna(m)]

        # Language columns = everything except metadata and the English/French
        # pivot languages. (Fixed: the exclusion set previously contained the
        # single malformed entry "eng_latn, fra_latn" instead of two entries,
        # so eng_latn/fra_latn columns were never actually excluded.)
        meta_columns = ["model", "prompt", "avg_score", "avg"]
        excluded_langs = {"eng", "fra", "eng_latn", "fra_latn", "en", "fr"}
        language_columns = [col for col in df.columns
                            if col not in meta_columns and col.lower() not in excluded_langs]
        avg_col = "avg" if "avg" in df.columns else "avg_score"

        if leaderboard == "afrobench":
            if dataset_name.lower() not in afrobench_tasks:
                continue
            # Initialize leaderboard structure.
            leaderboard_data.setdefault(task, {}).setdefault(subtask, {"datasets": {}})

            # Store per-model dataset scores: for each model, take the prompt
            # row with the best average score and average its language columns.
            dataset_scores = {}
            for model in models:
                model_df = df[df["model"] == model]
                best_avg_row = model_df.loc[model_df[avg_col].idxmax()]
                scores = [best_avg_row[col] for col in language_columns if col in best_avg_row]
                # Guard the empty case: round(None, 1) would raise TypeError.
                dataset_scores[model] = round(sum(scores) / len(scores), 1) if scores else None
            leaderboard_data[task][subtask]["datasets"][dataset_name] = dataset_scores

        elif leaderboard == "afrobench_lite":
            # Lowercased membership test, consistent with the afrobench branch
            # (the list entries are all lowercase).
            if dataset_name.lower() not in afrobench_lite_datasets:
                continue
            leaderboard_data.setdefault(subtask, {})

            # fillna is idempotent; run it once instead of once per model.
            df.fillna(0, inplace=True)

            dataset_scores = {}
            for model in models:
                model_df = df[df["model"] == model]
                best_avg_row = model_df.loc[model_df[avg_col].idxmax()]
                # Match lite languages by column prefix, e.g. "amh" matches
                # "amh_ethi"; take the first matching column for each language.
                scores = []
                for lang in afrobench_lite_languages:
                    matches = [c for c in best_avg_row.index if c.split('_')[0] == lang]
                    if matches:
                        scores.append(best_avg_row[matches[0]])
                dataset_scores[model] = round(sum(scores) / len(scores), 1) if scores else None

                lite_language_scores.setdefault(model, {})
                for lang in afrobench_lite_languages:
                    if lang in df.columns:
                        val = df.loc[df["model"] == model, lang].values
                        if val.size > 0:
                            lite_language_scores[model].setdefault(lang, []).append(val[0])

            # Merge in community-submitted June 2025 results for this dataset.
            df = pd.read_csv(os.path.join(community_result, "New Results - June2025.csv"))
            df = df[df['task'] == dataset_name]
            df.fillna(0, inplace=True)
            for model in df["model"].unique():
                scores = [df.loc[df["model"] == model, col].values[0]
                          for col in afrobench_lite_languages if col in df.columns]
                dataset_scores[model] = round(sum(scores) / len(scores), 1) if scores else None

                lite_language_scores.setdefault(model, {})
                for lang in afrobench_lite_languages:
                    if lang in df.columns:
                        val = df.loc[df["model"] == model, lang].values
                        if val.size > 0:
                            lite_language_scores[model].setdefault(lang, []).append(val[0])

            leaderboard_data[subtask][dataset_name] = dataset_scores

        else:
            # Per-task JSON mode: keep the raw per-language scores per model.
            task_data.setdefault(task, {"task": task, "subtasks": {}})
            task_data[task]["subtasks"].setdefault(subtask, {"datasets": {}})
            task_data[task]["subtasks"][subtask]["datasets"][dataset_name] = {
                "languages": language_columns,
                "scores": {}
            }
            for model in models:
                model_df = df[df["model"] == model]
                best_avg_row = model_df.loc[model_df[avg_col].idxmax()]
                model_scores = [round(score, 1) for score in best_avg_row[language_columns].to_list()]
                task_data[task]["subtasks"][subtask]["datasets"][dataset_name]["scores"][model] = model_scores

    # Save leaderboard JSON if enabled.
    if leaderboard:
        output_path = os.path.join(output_dir, f"{leaderboard}.json")
        with open(output_path, "w", encoding="utf-8") as json_file:
            json.dump(leaderboard_data, json_file, indent=4)
        print("Leaderboard JSON generated successfully!")

        if leaderboard == "afrobench_lite":
            # Per-language means across all lite datasets each model appeared in.
            lang_output = os.path.join(output_dir, "lite_language_scores.json")
            averaged_scores = {
                model: {
                    lang: round(sum(scores) / len(scores), 1)
                    for lang, scores in langs.items()
                }
                for model, langs in lite_language_scores.items()
            }
            with open(lang_output, "w", encoding="utf-8") as f:
                json.dump(averaged_scores, f, indent=4)
            print("Saved language version for lite")
    # Save per-task JSON files if leaderboard is not set.
    else:
        for task, data in task_data.items():
            output_path = os.path.join(output_dir, f"{task.lower().replace(' ', '_')}.json")
            with open(output_path, "w", encoding="utf-8") as json_file:
                json.dump(data, json_file, indent=4)
        print("Task-wise JSON files with subtasks generated successfully!")
# Module-level entry point: regenerates the Lite leaderboard JSON whenever
# this module is imported or run.
generate_json_files(leaderboard="afrobench_lite")