# kiliangoto's picture
# Initial commit of Sahabat-AI Leaderboard
# ef54478
import pandas as pd
import json
from .config import base_info, language_list, delimiter, avg_label, round_precision
def load_tables(file_path: str) -> pd.DataFrame:
"""
Load and process the leaderboard data from a JSONL file.
- Flattens nested JSON structures.
- Computes total scores for each language.
- Sorts models by their performance.
"""
data = []
with open(file_path, "r", encoding="utf-8") as f:
for line in f:
json_obj = json.loads(line) # Load each JSON object from the file
flattened = pd.json_normalize(json_obj, sep=delimiter) # Flatten the nested JSON structure
data.append(flattened)
# Combine all JSON objects into a single DataFrame
df = pd.concat(data, ignore_index=True)
# Round numeric values to the specified precision
df = df.map(lambda x: round(x, round_precision) if isinstance(x, (int, float)) else x)
base = pd.DataFrame()
# Extract base information (e.g., model name, type, size)
for info in base_info:
base[info["display"]] = df[info["key"]]
# Create the main leaderboard table
main_table = base.copy()
detailed_tables = []
for lang in language_list:
# Add total scores for each language to the main table
main_table[lang['display']] = df[f"{lang['key']}{delimiter}total"]
# Identify all columns related to the language
cols = [col for col in df.columns if col.startswith(lang["key"])]
total_col = None
table = base.copy()
for col in cols:
display_col = col.split(delimiter)[:-1] # Extract display column name
# Identify the total column (if it exists)
if len(display_col) == 1:
total_col = col
# Format column name for better readability
display_col = col if len(display_col) < 2 else " - ".join(display_col[1:])
table[display_col] = df[col]
# If a total column exists, move it to the front and sort the table
if total_col:
total_col_data = table.pop(total_col)
table.insert(len(base.columns), "Total", total_col_data)
table = table.sort_values(by="Total", ascending=False)
detailed_tables.append(table)
# Compute the overall average score for Indonesian languages
main_table[avg_label] = sum(
[main_table[lang["display"]] if lang["main_table_avg"] else 0 for lang in language_list]
)
main_table[avg_label] = round(
main_table[avg_label] / sum(lang["main_table_avg"] for lang in language_list), round_precision
)
# Move the average score column to the rightmost position
last_col = main_table.pop(main_table.columns[-1])
main_table.insert(len(base.columns), last_col.name, last_col)
# Sort models by the average score in descending order
main_table = main_table.sort_values(by=avg_label, ascending=False)
# Return structured leaderboard tables (overall + language-specific)
return [{"name": "Overall", "table": main_table, "hidden_col": []}] + [
{"name": lang["tab"], "table": table, "hidden_col": lang["hidden_col"]}
for lang, table in zip(language_list, detailed_tables)
]