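"""Dataset preview helpers for the H2H eval comparator (dataset_previews.py).

Computes summary statistics for the MMLU-Pro test set and formats them into a
preview table for display in Gradio.
"""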
import os
import json
import collections
from typing import Dict, Any

import pandas as pd

from mmlu_pro_eval_adapted import load_mmlu_pro


def calculate_dataset_statistics():
    """
    Calculate detailed statistics about the MMLU-Pro test set.

    Returns:
        Dict: Dictionary containing dataset statistics.
    """
try:
# Load MMLU-Pro data using the function from mmlu_pro_eval_adapted
test_df, val_df = load_mmlu_pro()
# Ensure consistent ordering
test_df = test_df.sort_values(['category', 'question_id'])
# Calculate total questions
total_questions = len(test_df)
# Calculate subject-wise question counts efficiently
subject_counts = test_df['category'].value_counts().to_dict()
# Count options per question efficiently using `.apply()`
options_counts = test_df['options'].apply(len).tolist()
max_options = max(options_counts)
avg_options = sum(options_counts) / len(options_counts)
# Count frequency of each option count
options_distribution = collections.Counter(options_counts)
return {
"total_questions": total_questions,
"subject_counts": subject_counts,
"max_options": max_options,
"avg_options": avg_options,
"options_distribution": options_distribution
}
except Exception as e:
print(f"Error calculating dataset statistics: {e}")
# Fallback values if calculation fails
return {
"total_questions": 12032,
"subject_counts": {"Total": 12032},
"max_options": 10,
"avg_options": 10.0,
"options_distribution": {10: 12032}
}
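# Example usage sketch (assumes mmlu_pro_eval_adapted and its MMLU-Pro data are
# available in this environment; the printed values depend on the loaded split):
#   stats = calculate_dataset_statistics()
#   print(stats["total_questions"], stats["max_options"], round(stats["avg_options"], 2))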
def mmlupro_dataset_preview(regenerate_preview=True) -> Dict[str, Any]:
    """
    Generate or retrieve the MMLU-Pro dataset preview information.

    Args:
        regenerate_preview (bool): If False, reuse an existing cached preview
            file when available; otherwise regenerate the preview data.

    Returns:
        Dict[str, Any]: Dictionary containing dataset information.
    """
preview_file = "/data/mmlu_pro_dataset_preview_table.json"
# Check if preview file exists
if not regenerate_preview and os.path.exists(preview_file):
try:
# Read existing preview file
with open(preview_file, 'r') as f:
preview_data = json.load(f)
print("BOOYAH")
return preview_data
except Exception as e:
print(f"Error reading preview file: {e}")
# If file exists but can't be read, regenerate it
# Generate preview data if file doesn't exist or couldn't be read
try:
# Calculate dataset statistics
stats = calculate_dataset_statistics()
# Format options distribution as a string
options_dist_str = f"Maximum: {stats['max_options']}\nAverage: {stats['avg_options']:.2f}\n"
sorted_options = sorted(stats["options_distribution"].items(), key=lambda x: x[0], reverse=True)
for num_options, count in sorted_options:
options_dist_str += f"{num_options}-choices: {count}, "
options_dist_str = options_dist_str.rstrip(", ")
# Create preview data
preview_data = {
"dataset_name": "MMLU-Pro",
"evaluation_type": "Multiple Choice",
"description": "MMLU-Pro is a refined version of the MMLU dataset, which has been a standard for multiple-choice knowledge assessment. Recent research identified issues with the original MMLU, such as noisy data (some unanswerable questions) and decreasing difficulty due to advances in model capabilities and increased data contamination. MMLU-Pro addresses these issues by presenting models with 10 choices instead of 4, requiring reasoning on more questions, and undergoing expert review to reduce noise. As a result, MMLU-Pro is of higher quality and currently more challenging than the original. A higher score is a better score.",
"links": {
"huggingface": "https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro",
"github": "https://github.com/TIGER-AI-Lab/MMLU-Pro",
"paper": "https://arxiv.org/abs/2406.01574"
},
"organization": "Questions are organized into 14 subjects. Each subject has 5 validation questions (for a total of 70). The 5 validation questions serve as 5-shot prompts for each evaluation question.",
"total_questions": stats["total_questions"],
"subject_counts": stats["subject_counts"],
"choices_per_question": options_dist_str
}
# Save preview data to file
try:
with open(preview_file, 'w') as f:
json.dump(preview_data, f, indent=2)
except Exception as e:
print(f"Error writing preview file: {e}")
except Exception as e:
# If calculation fails, fall back to hardcoded values
print(f"Error calculating dynamic values: {e}")
# Hardcoded fallback values
preview_data = {
"dataset_name": "MMLU-Pro",
"evaluation_type": "Multiple Choice",
"description": "MMLU-Pro is a refined version of the MMLU dataset, which has been a standard for multiple-choice knowledge assessment. Recent research identified issues with the original MMLU, such as noisy data (some unanswerable questions) and decreasing difficulty due to advances in model capabilities and increased data contamination. MMLU-Pro addresses these issues by presenting models with 10 choices instead of 4, requiring reasoning on more questions, and undergoing expert review to reduce noise. As a result, MMLU-Pro is of higher quality and currently more challenging than the original. A higher score is a better score.",
"links": {
"huggingface": "https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro",
"github": "https://github.com/TIGER-AI-Lab/MMLU-Pro",
"paper": "https://arxiv.org/abs/2406.01574"
},
"organization": "Questions are organized into 14 subjects. Each subject has 5 validation questions (for a total of 70). The 5 validation questions serve as 5-shot prompts for each evaluation question.",
"total_questions": 12032,
"subject_counts": f"Total: 12032 (Note: Using fallback value)",
"choices_per_question": "Maximum: 10\nAverage: 10.0\n10-choices: 12032"
}
return preview_data
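# Example usage sketch (regenerate_preview=False reuses the cached preview file
# under /data when it exists and is readable):
#   cached = mmlupro_dataset_preview(regenerate_preview=False)
#   print(cached["total_questions"])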
def subject_counts_formatting(subject_counts, total_questions):
    """Format per-subject question counts as a newline-separated string, in descending order of count."""
sorted_subjects = sorted(subject_counts.items(), key=lambda x: x[1], reverse=True)
subject_counts_str = f"Total: {total_questions}\n"
for subject, count in sorted_subjects:
subject_counts_str += f"{subject}: {count}\n"
subject_counts_str = subject_counts_str.strip()
return subject_counts_str
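# Example (illustrative subject counts, not the real per-subject totals):
#   subject_counts_formatting({"math": 1351, "physics": 1299}, 2650)
#   returns "Total: 2650\nmath: 1351\nphysics: 1299"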
def format_preview_for_display(preview_data: Dict[str, Any]) -> pd.DataFrame:
"""
Format the preview data with improved readability for display in Gradio
Args:
preview_data (Dict[str, Any]): Dataset preview information
Returns:
pd.DataFrame: Formatted data for display
"""
# Create links with bullet points
links_value = (
f"Dataset: {preview_data['links']['huggingface']}\n"
f"GitHub: {preview_data['links']['github']}\n"
f"Paper: {preview_data['links']['paper']}"
)
links_formatted = "• " + "\n• ".join(links_value.split('\n'))
# Create a table format with better column names
rows = [
{"Dataset Property": "Dataset Name", "Details": preview_data["dataset_name"]},
{"Dataset Property": "Evaluation Type", "Details": preview_data["evaluation_type"]},
{"Dataset Property": "Description", "Details": preview_data["description"]},
{"Dataset Property": "Links", "Details": links_formatted},
{"Dataset Property": "Organization", "Details": preview_data["organization"]},
{"Dataset Property": "Number of Questions", "Details": subject_counts_formatting(preview_data["subject_counts"],preview_data["total_questions"])},
{"Dataset Property": "Choices per Question", "Details": preview_data["choices_per_question"]}
]
return pd.DataFrame(rows)
# For standalone testing
if __name__ == "__main__":
preview_data = mmlupro_dataset_preview()
print("Preview data generated:")
for key, value in preview_data.items():
if key != "links":
print(f"\n{key}:\n{value}")