# leaderboard/refresh.py
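"""Refresh the leaderboard data files.

Reads per-model info JSONs and per-task result JSONs, fills gaps from the
task templates, and writes one JSONL file per task (plus an aggregate "all"
table) into the boards_data folder.
"""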
import json
import logging
from pathlib import Path
from typing import Any, Dict, List, Optional

import pandas as pd
import yaml

# --- Logging Setup ---
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(module)s - %(message)s"
)
logger = logging.getLogger(__name__)

# --- Path Definitions ---
SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = SCRIPT_DIR.parent

# --- Default Input/Output Paths ---
DEFAULT_MODELS_FOLDER = PROJECT_ROOT.parent / "llm-leaderboard/models_info"
DEFAULT_RESULTS_FOLDER = PROJECT_ROOT.parent / "llm-leaderboard/results"
OUTPUT_FOLDER = SCRIPT_DIR / "boards_data"
CONFIG_FILE_PATH = SCRIPT_DIR / "leaderboard_config.yaml"
TEMPLATE_FOLDER = SCRIPT_DIR / "template_jsons"

# --- Constants for Subtask Processing ---
NLU_NLG_TASK_KEYS = ["persian_nlu", "persian_nlg"]

ALL_LEADERBOARD_COLUMNS = [
    'Model Name', 'model_url', 'parameters_count', 'source_type', 'Average',
    'Persian IFEval', 'Persian MT-Bench', "PerMMLU",
    "PerCoR", "Persian NLU", "Persian NLG"
]


def load_tasks_from_config(config_path: Path) -> Dict[str, str]:
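    """Load the task_key -> display-name mapping from the YAML config.

    Returns an empty dict (and logs the problem) if the file is missing,
    malformed, or has no usable 'task_display_names' entries. The special
    'all' key is excluded because it names the aggregate table, not a task.
    """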
    if not config_path.exists():
        logger.error(f"Configuration file not found: {config_path}. Cannot load tasks.")
        return {}
    try:
        with open(config_path, 'r', encoding='utf-8') as f:
            config_data = yaml.safe_load(f)
        tasks_from_config = config_data.get('task_display_names', {})
        if not isinstance(tasks_from_config, dict):
            logger.error(f"'task_display_names' in {config_path} is not a dictionary.")
            return {}
        processed_tasks = {k: v for k, v in tasks_from_config.items() if str(k).lower() != 'all'}
        if not processed_tasks:
            logger.warning(f"No tasks in {config_path} under 'task_display_names' (excluding 'all').")
        return processed_tasks
    except Exception as e:
        logger.error(f"Error loading config {config_path}: {e}")
        return {}


class ModelEvaluationProcessor:
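    """Builds the per-task leaderboard tables and the aggregate 'all' table
    from the model info files and per-task result files."""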
    def __init__(
        self,
        models_info_path: Path,
        results_base_path: Path,
        output_path: Path,
        template_jsons_path: Path,
    ) -> None:
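        """Store input/output paths, create the output folder, and load the
        task configuration plus the per-task main-score metric names."""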
        self.models_info_path = models_info_path
        self.results_base_path = results_base_path
        self.output_path = output_path
        self.template_folder = template_jsons_path
        self.output_path.mkdir(parents=True, exist_ok=True)

        self.tasks_config = load_tasks_from_config(CONFIG_FILE_PATH)
        if not self.tasks_config:
            logger.error("Tasks config is empty. Processing might be affected.")

        # Metric that serves as the headline ("main") score for each task key.
        self.main_scores_map = {
            "ifeval": "strict_instruction_accuracy",
            "mt_bench": "score_mean",
            "MMLU": "acc",
            "persian_csr": "acc",
            "persian_nlg": "nlg_score",
            "persian_nlu": "nlu_score",
        }

    def _load_template(self, task_key: str) -> Dict[str, Any]:
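        """Load the JSON template for a task; return {} if it is missing or unreadable."""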
        path = self.template_folder / f"{task_key}.json"
        try:
            return json.loads(path.read_text(encoding="utf-8"))
        except FileNotFoundError:
            logger.warning(f"Template file not found for task_key {task_key} at {path}. Using empty template.")
            return {}
        except Exception as e:
            logger.error(f"Cannot load template for task_key {task_key} from {path}: {e}")
            return {}

    def _deep_override(self, base: Any, override: Any) -> Any:
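        """Recursively overlay `override` onto `base`.

        Values of None or -1 in `override` are treated as missing, so the
        template value from `base` is kept; keys present only in `override`
        are ignored, meaning the template defines the final schema.
        """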
        if isinstance(base, dict) and isinstance(override, dict):
            merged = {}
            for k, v_base in base.items():
                if k in override and override[k] is not None and override[k] != -1:
                    merged[k] = self._deep_override(v_base, override[k])
                else:
                    merged[k] = v_base
            # for k, v_override in override.items():
            #     if k not in merged:
            #         merged[k] = v_override
            return merged
        elif override is not None and override != -1:
            return override
        else:
            return base

    def _load_model_raw_results(self, model_folder_name: str, task_key: str) -> Dict[str, Any]:
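        """Read results/<model_folder_name>___<task_key>.json; return {} if absent or invalid."""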
        results_filename = f"{model_folder_name}___{task_key}.json"
        results_file_path = self.results_base_path / results_filename
        if results_file_path.exists():
            try:
                with open(results_file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                return data if isinstance(data, dict) else {}
            except json.JSONDecodeError as e:
                logger.error(f"JSONDecodeError for model '{model_folder_name}', task_key '{task_key}' from {results_file_path}: {e}")
            except Exception as e:
                logger.error(f"Error loading results for model '{model_folder_name}', task_key '{task_key}' from {results_file_path}: {e}")
        else:
            logger.warning(f"Results file not found for model '{model_folder_name}', task_key '{task_key}' at {results_file_path}")
        return {}

    def load_and_fill_task_results(self, model_folder_name: str, task_key: str) -> Dict[str, Any]:
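        """Return the task template with the model's raw results overlaid on top."""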
        template = self._load_template(task_key)
        raw_results = self._load_model_raw_results(model_folder_name, task_key)
        return self._deep_override(template, raw_results)

    def clean_previous_subtask_files(self) -> None:
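        """Delete stale per-subtask JSONL files produced by earlier runs.

        Subtask names are discovered from the NLU/NLG result files: every
        dict-valued key other than the task's main score is treated as a
        subtask whose boards_data/<subtask>.jsonl file should be rebuilt.
        """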
logger.info("Cleaning previous NLU/NLG subtask JSONL files...") | |
for task_key_prefix in NLU_NLG_TASK_KEYS: | |
for result_file in self.results_base_path.glob(f"*___{task_key_prefix}.json"): | |
try: | |
task_data_content = result_file.read_text(encoding="utf-8") | |
if not task_data_content.strip(): | |
logger.debug(f"Skipping empty result file for subtask cleaning: {result_file}") | |
continue | |
task_data = json.loads(task_data_content) | |
main_score_for_this_task_prefix = self.main_scores_map.get(task_key_prefix) | |
for subtask_name in task_data: | |
if subtask_name == main_score_for_this_task_prefix: | |
continue | |
if isinstance(task_data.get(subtask_name), dict): | |
subtask_output_path = self.output_path / f"{subtask_name}.jsonl" | |
if subtask_output_path.exists(): | |
subtask_output_path.unlink() | |
logger.info(f"Deleted previous subtask file: {subtask_output_path}") | |
except json.JSONDecodeError as e: | |
logger.warning(f"Failed to decode JSON for subtask cleaning from {result_file}: {e}") | |
except Exception as e: | |
logger.warning(f"Failed to inspect/delete subtask files based on {result_file}: {e}") | |
def _process_subtask_data(self, task_results: Dict[str, Any], base_model_info: Dict[str, Any], parent_task_main_score_key: Optional[str], parent_task_key_for_log: str) -> None: | |
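        """Append one row per subtask to the matching <subtask>.jsonl file.

        Each row combines the model's identity columns with that subtask's
        scores (plus the parent task's main score); any existing row for the
        same model is replaced rather than duplicated.
        """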
        parent_task_main_score_value = task_results.get(parent_task_main_score_key) if parent_task_main_score_key else None
        for subtask_name, subtask_scores_dict in task_results.items():
            if subtask_name == parent_task_main_score_key:
                continue
            if not isinstance(subtask_scores_dict, dict):
                logger.debug(f"Skipping entry '{subtask_name}' in '{parent_task_key_for_log}': not a dictionary of subtask scores.")
                continue
            row_data = base_model_info.copy()
            row_data.update(subtask_scores_dict)
            if parent_task_main_score_key:
                row_data[parent_task_main_score_key] = parent_task_main_score_value
            subtask_output_file = f"{subtask_name}.jsonl"
            subtask_output_path = self.output_path / subtask_output_file
            try:
                current_entries = []
                if subtask_output_path.exists():
                    existing_df = pd.read_json(subtask_output_path, lines=True)
                    if not existing_df.empty and 'Model Name' in existing_df.columns:
                        current_entries = existing_df[existing_df['Model Name'] != row_data['Model Name']].to_dict(orient='records')
                current_entries.append(row_data)
                updated_df = pd.DataFrame(current_entries)
                updated_df.to_json(subtask_output_path, orient="records", lines=True, force_ascii=False)
                logger.debug(f"Updated subtask file: {subtask_output_path} for model {base_model_info.get('Model Name')}, parent task {parent_task_key_for_log}")
            except Exception as e:
                logger.error(f"Error updating subtask file {subtask_output_path} for parent {parent_task_key_for_log}: {e}")

    def process_nlu_nlg_subtasks(self, model_details: Dict[str, Any], model_folder_name: str, canonical_model_name: str) -> None:
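        """Write per-subtask rows for the NLU/NLG tasks of a single model."""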
        common_subtask_model_info = {
            "Model Name": canonical_model_name,
            "model_url": model_details.get('model_url', model_details.get('link', model_details.get('homepage', 'https://google.com'))),
            "parameters_count": str(model_details.get('n_parameters', "N/A")),
            "source_type": "Closed-Source"  # Default, will be refined
        }
        parameters_count_raw = model_details.get('n_parameters', None)
        if parameters_count_raw is not None:
            is_open_source_candidate = False
            if isinstance(parameters_count_raw, (int, float)) and parameters_count_raw > 0:
                is_open_source_candidate = True
            elif isinstance(parameters_count_raw, str) and \
                    str(parameters_count_raw).strip().lower() not in ["", "n/a", "unknown", "private", "confidential", "tbd", "null", "closed"]:
                is_open_source_candidate = True
            common_subtask_model_info["source_type"] = "Open-Source" if is_open_source_candidate else "Closed-Source"

        for task_key_for_subtasks in NLU_NLG_TASK_KEYS:
            if task_key_for_subtasks not in self.tasks_config:
                logger.debug(f"Subtask processing for '{task_key_for_subtasks}' skipped: not in tasks_config.")
                continue
            logger.info(f"Processing subtasks for '{task_key_for_subtasks}' for model '{canonical_model_name}'...")
            parent_task_full_results = self.load_and_fill_task_results(model_folder_name, task_key_for_subtasks)
            main_score_key_for_parent_task = self.main_scores_map.get(task_key_for_subtasks)
            if not main_score_key_for_parent_task:
                logger.warning(f"No main score key in main_scores_map for parent task '{task_key_for_subtasks}'.")
            self._process_subtask_data(
                parent_task_full_results,
                common_subtask_model_info,
                main_score_key_for_parent_task,
                task_key_for_subtasks
            )

    def process_models(self) -> Dict[str, pd.DataFrame]:
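        """Build one DataFrame per configured task plus the aggregate 'all' table.

        For every model-info JSON, load each task's template-filled results,
        record the full metrics for the per-task table, and collect the main
        score per task for the summary table, whose 'Average' column is the
        unweighted mean of the task scores (NA if any score is missing).
        """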
        processed_task_data: Dict[str, List[Dict[str, Any]]] = {task_key: [] for task_key in self.tasks_config.keys()}
        all_models_summary_data: List[Dict[str, Any]] = []

        if not self.models_info_path.exists() or not self.models_info_path.is_dir():
            logger.critical(f"Configured MODELS_FOLDER path does not exist or is not a directory: {self.models_info_path}")
            empty_dfs = {key: pd.DataFrame() for key in self.tasks_config.keys()}
            empty_dfs["all"] = pd.DataFrame()
            return empty_dfs

        model_info_files = list(self.models_info_path.glob("*.json"))
        if not model_info_files:
            logger.warning(f"No model info files (*.json) found in {self.models_info_path}. No models will be processed.")
            empty_dfs = {key: pd.DataFrame() for key in self.tasks_config.keys()}
            empty_dfs["all"] = pd.DataFrame()
            return empty_dfs

        for model_info_file in model_info_files:
            model_folder_name = model_info_file.stem
            try:
                with open(model_info_file, 'r', encoding='utf-8') as f:
                    model_details = json.load(f)
                canonical_model_name = model_details.get(
                    'name_for_leaderboard',
                    model_details.get('model_hf_id',
                                      model_details.get('name', model_folder_name)))
                model_url = model_details.get('model_url', model_details.get('link', model_details.get('homepage', 'https://google.com')))
                if not model_url:
                    model_url = 'https://google.com'
                parameters_count_raw = model_details.get('n_parameters', None)
                parameters_count_display = str(parameters_count_raw) if parameters_count_raw is not None else "N/A"
                source_type = "Closed-Source"
                if parameters_count_raw is not None:
                    is_open_source_candidate = False
                    if isinstance(parameters_count_raw, (int, float)) and parameters_count_raw > 0:
                        is_open_source_candidate = True
                    elif isinstance(parameters_count_raw, str) and \
                            str(parameters_count_raw).strip().lower() not in ["", "n/a", "unknown", "private", "confidential", "tbd", "null", "closed"]:
                        is_open_source_candidate = True
                    source_type = "Open-Source" if is_open_source_candidate else "Closed-Source"
            except Exception as e:
                logger.error(f"Error loading/parsing model info from {model_info_file}: {e}. Skipping '{model_folder_name}'.")
                continue

            logger.info(f"Processing model: {canonical_model_name} (source ID: {model_folder_name})")
            current_model_scores_for_summary: Dict[str, Any] = {
                "Model Name": canonical_model_name,
                "model_url": model_url,
                "parameters_count": parameters_count_display,
                "source_type": source_type
            }
            for task_key, task_display_name in self.tasks_config.items():
                task_specific_results = self.load_and_fill_task_results(model_folder_name, task_key)
                main_score_metric_name = self.main_scores_map.get(task_key)
                task_data_entry_for_specific_jsonl: Dict[str, Any] = {
                    "Model Name": canonical_model_name,
                    "model_url": model_url,
                    "parameters_count": parameters_count_display,
                    "source_type": source_type
                }
                if isinstance(task_specific_results, dict) and task_specific_results:
                    for metric, value in task_specific_results.items():
                        task_data_entry_for_specific_jsonl[metric] = value
                    if main_score_metric_name and main_score_metric_name in task_specific_results:
                        score_value = task_specific_results[main_score_metric_name]
                        if task_key == "mt_bench" and score_value is not None:
                            # mt_bench scores are reported out of 10; rescale to 0-1 before averaging with the other tasks.
                            try:
                                score_value = float(score_value) / 10.0
                            except (ValueError, TypeError):
                                logger.warning(f"Could not convert mt_bench score '{score_value}' to float for division for model {canonical_model_name}")
                                score_value = pd.NA
                        current_model_scores_for_summary[task_display_name] = score_value
                    elif main_score_metric_name:
                        logger.warning(f"Main score metric '{main_score_metric_name}' for task '{task_key}' (Display: {task_display_name}) not found for model '{canonical_model_name}'. Will be NA.")
                        current_model_scores_for_summary[task_display_name] = pd.NA
                        task_data_entry_for_specific_jsonl[main_score_metric_name] = pd.NA
                else:
                    logger.warning(f"No valid results data for model '{canonical_model_name}', task_key '{task_key}'. Scores will be NA.")
                    if main_score_metric_name:
                        task_data_entry_for_specific_jsonl[main_score_metric_name] = pd.NA
                        current_model_scores_for_summary[task_display_name] = pd.NA
                processed_task_data[task_key].append(task_data_entry_for_specific_jsonl)

            all_models_summary_data.append(current_model_scores_for_summary)
            self.process_nlu_nlg_subtasks(model_details, model_folder_name, canonical_model_name)
        final_dataframes: Dict[str, pd.DataFrame] = {}
        for task_key, data_list in processed_task_data.items():
            df = pd.DataFrame(data_list) if data_list else pd.DataFrame()
            main_score_col = self.main_scores_map.get(task_key)
            if not df.empty and main_score_col and main_score_col in df.columns:
                try:
                    df[main_score_col] = pd.to_numeric(df[main_score_col], errors='coerce')
                    # Sort by main score, NaNs last.
                    df = df.sort_values(by=main_score_col, ascending=False, na_position='last')
                except Exception as e:
                    logger.warning(f"Could not sort dataframe for task {task_key} by score {main_score_col}: {e}")
            final_dataframes[task_key] = df
            if df.empty:
                logger.warning(f"No data processed for task '{task_key}'. Resulting DataFrame is empty.")

        if all_models_summary_data:
            all_df = pd.DataFrame(all_models_summary_data)
            score_cols_for_average = []
            for _, task_display_name_for_avg in self.tasks_config.items():
                if task_display_name_for_avg in all_df.columns:
                    numeric_col = pd.to_numeric(all_df[task_display_name_for_avg], errors='coerce')
                    if numeric_col.notna().any():  # At least one non-NA numeric value
                        all_df[task_display_name_for_avg] = numeric_col
                        score_cols_for_average.append(task_display_name_for_avg)
                    else:  # All values are NA or non-numeric
                        all_df[task_display_name_for_avg] = pd.NA
                        logger.warning(f"Column '{task_display_name_for_avg}' for averaging in 'all' table is not numeric or all NaN. Excluding from average calculation and setting to NA.")
            if score_cols_for_average:
                try:
                    # The mean is NaN for a row if any constituent score is NaN (skipna=False).
                    all_df["Average"] = all_df[score_cols_for_average].mean(axis=1, skipna=False)
                    # Round only the non-NaN averages.
                    all_df.loc[all_df["Average"].notna(), "Average"] = all_df.loc[all_df["Average"].notna(), "Average"].round(4)
                except Exception as e:
                    logger.error(f"Error calculating 'Average' for 'all' table: {e}. Average column might be NA or incorrect.")
                    all_df["Average"] = pd.NA  # Fallback to NA
            else:
                logger.warning("No valid numeric score columns found to calculate 'Average' for 'all' table.")
                all_df["Average"] = pd.NA

            # Sort the 'all' table by Average, NaNs last.
            if "Average" in all_df.columns:
                all_df = all_df.sort_values(by="Average", ascending=False, na_position='last')

            existing_cols_in_order = [col for col in ALL_LEADERBOARD_COLUMNS if col in all_df.columns]
            other_cols = [col for col in all_df.columns if col not in existing_cols_in_order]
            all_df = all_df[existing_cols_in_order + other_cols]
            final_dataframes["all"] = all_df
        else:
            final_dataframes["all"] = pd.DataFrame()
            logger.warning("No summary data collected for the 'all' table.")
        return final_dataframes

    def save_dataframe_as_jsonl(self, df: pd.DataFrame, filename_base: str) -> None:
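        """Write a DataFrame to boards_data/<filename_base>.jsonl (skipped when empty)."""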
        if df is None or df.empty:
            logger.warning(f"DataFrame for '{filename_base}.jsonl' is empty or None. Skipping save.")
            return
        output_file_path = self.output_path / f"{filename_base}.jsonl"
        try:
            # orient="records" never writes the index, and some pandas versions reject
            # index=False with this orient, so the index argument is omitted here.
            df.to_json(output_file_path, orient="records", lines=True, force_ascii=False)
            logger.info(f"Saved data to {output_file_path}")
        except Exception as e:
            logger.error(f"Failed to save DataFrame to {output_file_path}: {e}")

    def run(self) -> None:
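        """Run the full pipeline: clean stale subtask files, process all models, save the JSONL outputs."""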
logger.info("Starting data processing pipeline in ModelEvaluationProcessor...") | |
self.clean_previous_subtask_files() | |
processed_dataframes = self.process_models() | |
for task_key_or_name, df in processed_dataframes.items(): | |
self.save_dataframe_as_jsonl(df, task_key_or_name) | |
logger.info("Data processing pipeline completed successfully!") | |
def main() -> None: | |
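    """Validate the configured paths and run the ModelEvaluationProcessor."""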
    models_folder_to_use = DEFAULT_MODELS_FOLDER
    results_folder_to_use = DEFAULT_RESULTS_FOLDER
    template_folder_to_use = TEMPLATE_FOLDER

    logger.info(f"Refresh script running from: {SCRIPT_DIR}")
    logger.info(f"CONFIGURED Input 'models_info' Path: {models_folder_to_use}")
    logger.info(f"CONFIGURED Input 'results' Path: {results_folder_to_use}")
    logger.info(f"CONFIGURED Input 'template_jsons' Path: {template_folder_to_use}")
    logger.info(f"Outputting processed data to (inside 'leaderboard' dir): {OUTPUT_FOLDER}")
    logger.info(f"Using configuration file (inside 'leaderboard' dir): {CONFIG_FILE_PATH}")

    if not CONFIG_FILE_PATH.exists():
        logger.critical(f"CRITICAL: Config file not found at {CONFIG_FILE_PATH}. Ensure '{CONFIG_FILE_PATH.name}' exists in '{SCRIPT_DIR}'.")
        return
    if not models_folder_to_use.exists() or not models_folder_to_use.is_dir():
        logger.critical(f"CRITICAL: Input 'models_info' directory not found at {models_folder_to_use} or is not a directory.")
        return
    if not results_folder_to_use.exists() or not results_folder_to_use.is_dir():
        logger.critical(f"CRITICAL: Input 'results' directory not found at {results_folder_to_use} or is not a directory.")
        return
    if not template_folder_to_use.exists() or not template_folder_to_use.is_dir():
        logger.warning(f"WARNING: 'template_jsons' directory not found at {template_folder_to_use}. Template filling might not work as expected.")

    try:
        processor = ModelEvaluationProcessor(
            models_info_path=models_folder_to_use,
            results_base_path=results_folder_to_use,
            output_path=OUTPUT_FOLDER,
            template_jsons_path=template_folder_to_use,
        )
        processor.run()
    except Exception as e:
        logger.error(f"Unhandled exception in main: {e}", exc_info=True)


if __name__ == "__main__":
    main()