Sahabat-AI-Leaderboard

Running

App Files Files Community

Sahabat-AI-Leaderboard / src /populate.py

kiliangoto

Initial commit of Sahabat-AI Leaderboard

ef54478 15 days ago

raw

history blame contribute delete

3.3 kB

	import pandas as pd
	import json

	from .config import base_info, language_list, delimiter, avg_label, round_precision

	def _read_jsonl_records(file_path: str) -> pd.DataFrame:
	    """Read a JSONL file into a DataFrame with one row per JSON object.

	    Nested objects are flattened with ``pd.json_normalize``, joining nested
	    keys with the configured ``delimiter``. Blank lines are skipped.

	    Raises:
	        ValueError: if the file contains no JSON records.
	    """
	    frames = []
	    with open(file_path, "r", encoding="utf-8") as f:
	        for line in f:
	            # Blank (e.g. trailing) lines are not JSON; json.loads would raise.
	            if not line.strip():
	                continue
	            frames.append(pd.json_normalize(json.loads(line), sep=delimiter))

	    if not frames:
	        raise ValueError(f"No leaderboard records found in {file_path!r}")
	    return pd.concat(frames, ignore_index=True)


	def _round_numeric(value):
	    """Round ints/floats to ``round_precision``; return anything else as is."""
	    # bool is a subclass of int, and round(True, n) returns the int 1, so
	    # booleans must be excluded to keep flag columns boolean.
	    if isinstance(value, (int, float)) and not isinstance(value, bool):
	        return round(value, round_precision)
	    return value


	def _build_language_table(df: pd.DataFrame, base: pd.DataFrame, lang_key: str) -> pd.DataFrame:
	    """Build the detailed table for one language.

	    The table starts with a copy of ``base`` and gets one column for every
	    column of ``df`` that belongs to ``lang_key``. A column directly under
	    the language key is treated as the total: it is renamed "Total", placed
	    right after the base columns, and the table is sorted by it (descending).
	    """
	    # Match on the key *plus* delimiter so that a key which is a prefix of
	    # another key (e.g. "id" vs "idx") does not pick up foreign columns.
	    prefix = f"{lang_key}{delimiter}"
	    table = base.copy()
	    total_col = None

	    for col in df.columns:
	        if col != lang_key and not col.startswith(prefix):
	            continue

	        # Path components without the trailing leaf (e.g. the metric name).
	        path = col.split(delimiter)[:-1]
	        if len(path) == 1:
	            total_col = col

	        # Drop the language key from the label; keep the raw column name
	        # when there is nothing left to display.
	        label = col if len(path) < 2 else " - ".join(path[1:])
	        table[label] = df[col]

	    if total_col:
	        table.insert(len(base.columns), "Total", table.pop(total_col))
	        table = table.sort_values(by="Total", ascending=False)

	    return table


	def load_tables(file_path: str) -> list[dict]:
	    """
	    Load and process the leaderboard data from a JSONL file.

	    - Flattens nested JSON structures.
	    - Rounds numeric values to ``round_precision``.
	    - Builds an overall table with one total-score column per language and
	      an average over the languages flagged with ``main_table_avg``.
	    - Builds one detailed table per language.
	    - Sorts models by their performance (descending).

	    Args:
	        file_path: Path to the JSONL file, one JSON object per line.

	    Returns:
	        A list of dicts with keys "name", "table" and "hidden_col": first
	        the "Overall" table, then one entry per item of ``language_list``,
	        in the same order.

	    Raises:
	        ValueError: if the file contains no JSON records.
	    """
	    df = _read_jsonl_records(file_path)

	    # DataFrame.map was added in pandas 2.1; older versions call it applymap.
	    apply_elementwise = getattr(df, "map", None) or df.applymap
	    df = apply_elementwise(_round_numeric)

	    # Base information shared by every table (columns listed in base_info).
	    base = pd.DataFrame()
	    for info in base_info:
	        base[info["display"]] = df[info["key"]]

	    main_table = base.copy()
	    detailed_tables = []
	    for lang in language_list:
	        # Total score of the language goes into the overall table.
	        main_table[lang["display"]] = df[f"{lang['key']}{delimiter}total"]
	        detailed_tables.append(_build_language_table(df, base, lang["key"]))

	    # Average the total scores of the languages flagged for the main average.
	    score_sum = sum(
	        main_table[lang["display"]] for lang in language_list if lang["main_table_avg"]
	    )
	    flagged = sum(lang["main_table_avg"] for lang in language_list)
	    # Assign first so the division is done on a Series (NaN rather than
	    # ZeroDivisionError when no language is flagged).
	    main_table[avg_label] = score_sum
	    main_table[avg_label] = (main_table[avg_label] / flagged).round(round_precision)

	    # Place the average right after the base columns, ahead of the
	    # per-language totals. Pop by name instead of assuming it is last.
	    avg_col = main_table.pop(avg_label)
	    main_table.insert(len(base.columns), avg_label, avg_col)

	    # Sort models by the average score in descending order.
	    main_table = main_table.sort_values(by=avg_label, ascending=False)

	    # Structured leaderboard tables (overall + language-specific).
	    overall = {"name": "Overall", "table": main_table, "hidden_col": []}
	    per_language = [
	        {"name": lang["tab"], "table": table, "hidden_col": lang["hidden_col"]}
	        for lang, table in zip(language_list, detailed_tables)
	    ]
	    return [overall] + per_language