import json

import pandas as pd

from .config import base_info, language_list, delimiter, avg_label, round_precision


def _round_number(value):
    """Round real numbers to ``round_precision``; return anything else unchanged."""
    # bool is a subclass of int and round(True, n) yields the int 1, so
    # booleans are excluded to keep flag-like fields intact.
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return round(value, round_precision)
    return value


def _read_records(file_path: str) -> pd.DataFrame:
    """Read a JSONL file into a single DataFrame with flattened column names."""
    with open(file_path, "r", encoding="utf-8") as f:
        # Blank lines (e.g. a trailing newline at the end of the file) are
        # skipped instead of being handed to json.loads, which would raise.
        frames = [
            pd.json_normalize(json.loads(line), sep=delimiter)
            for line in f
            if line.strip()
        ]

    if not frames:
        # pd.concat would raise a ValueError here too, but with a message
        # that does not mention the offending file.
        raise ValueError(f"No leaderboard records found in {file_path!r}")

    return pd.concat(frames, ignore_index=True)


def _build_language_table(df: pd.DataFrame, base: pd.DataFrame, lang: dict) -> pd.DataFrame:
    """Build the detailed table (base info + per-column scores) for one language."""
    key = lang["key"]
    key_prefix = f"{key}{delimiter}"
    # Match on "<key><delimiter>" rather than the bare key so that a language
    # whose key is a prefix of another one does not pick up foreign columns.
    lang_cols = [col for col in df.columns if col == key or col.startswith(key_prefix)]

    table = base.copy()
    total_col = None
    for col in lang_cols:
        parents = col.split(delimiter)[:-1]  # path segments without the last one

        # A column with exactly one parent segment ("<key><delimiter><leaf>")
        # is treated as the language total; the last such column wins.
        if len(parents) == 1:
            total_col = col

        # Deeper columns are shown as their intermediate segments joined by
        # " - "; shallow ones keep the full flattened name.
        display_name = col if len(parents) < 2 else " - ".join(parents[1:])
        table[display_name] = df[col]

    if total_col:
        # Place the total right after the base-info columns and rank by it.
        table.insert(len(base.columns), "Total", table.pop(total_col))
        table = table.sort_values(by="Total", ascending=False)

    return table


def load_tables(file_path: str) -> list[dict]:
    """
    Load and process the leaderboard data from a JSONL file.

    - Flattens nested JSON structures (column names joined with ``delimiter``).
    - Rounds numeric values to ``round_precision``.
    - Builds an overall table with one total column per language plus an
      average over the languages flagged with ``main_table_avg``.
    - Builds one detailed table per language, sorted by its total.

    Args:
        file_path: Path to a UTF-8 JSONL file with one JSON object per line.
            Blank lines are ignored.

    Returns:
        A list of dicts with the keys ``"name"``, ``"table"`` and
        ``"hidden_col"``. The first entry is the ``"Overall"`` table; the
        remaining entries follow the order of ``language_list``.

    Raises:
        ValueError: If the file contains no records.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a column referenced by ``base_info`` or the
            ``<key><delimiter>total`` column of a language is missing.
    """
    df = _read_records(file_path)

    # Round numeric values to the specified precision
    df = df.map(_round_number)

    # Extract base information columns under their display names
    base = pd.DataFrame({info["display"]: df[info["key"]] for info in base_info})

    # Create the main leaderboard table and the per-language tables
    main_table = base.copy()
    detailed_tables = []
    for lang in language_list:
        main_table[lang["display"]] = df[f"{lang['key']}{delimiter}total"]
        detailed_tables.append(_build_language_table(df, base, lang))

    # Average the totals of the languages flagged with "main_table_avg".
    # Assigning the sum to the column first keeps the division a pandas
    # operation even when no language is flagged (the sum is then scalar 0).
    main_table[avg_label] = sum(
        main_table[lang["display"]] for lang in language_list if lang["main_table_avg"]
    )
    flagged_count = sum(lang["main_table_avg"] for lang in language_list)
    main_table[avg_label] = (main_table[avg_label] / flagged_count).round(round_precision)

    # Move the average column so it sits directly after the base-info
    # columns, ahead of the per-language totals.
    avg_scores = main_table.pop(avg_label)
    main_table.insert(len(base.columns), avg_label, avg_scores)

    # Sort models by the average score in descending order
    main_table = main_table.sort_values(by=avg_label, ascending=False)

    # Return structured leaderboard tables (overall + language-specific)
    overall = [{"name": "Overall", "table": main_table, "hidden_col": []}]
    per_language = [
        {"name": lang["tab"], "table": table, "hidden_col": lang["hidden_col"]}
        for lang, table in zip(language_list, detailed_tables)
    ]
    return overall + per_language