import collections
import json
import os
from typing import Any, Dict

import pandas as pd

from mmlu_pro_eval_adapted import load_mmlu_pro

def calculate_dataset_statistics():
    """
    Calculate detailed statistics about the MMLU-Pro dataset.

    Returns:
        Dict: Dictionary containing dataset statistics
    """
    try:
        # Load MMLU-Pro data; the validation split is not needed here
        test_df, _val_df = load_mmlu_pro()

        # Ensure consistent ordering
        test_df = test_df.sort_values(['category', 'question_id'])

        # Total number of test questions
        total_questions = len(test_df)

        # Subject-wise question counts
        subject_counts = test_df['category'].value_counts().to_dict()

        # Number of answer options per question
        options_counts = test_df['options'].apply(len).tolist()
        max_options = max(options_counts)
        avg_options = sum(options_counts) / len(options_counts)

        # Frequency of each option count
        options_distribution = collections.Counter(options_counts)

        return {
            "total_questions": total_questions,
            "subject_counts": subject_counts,
            "max_options": max_options,
            "avg_options": avg_options,
            "options_distribution": options_distribution
        }
    except Exception as e:
        print(f"Error calculating dataset statistics: {e}")
        # Fallback values if calculation fails
        return {
            "total_questions": 12032,
            "subject_counts": {"Total": 12032},
            "max_options": 10,
            "avg_options": 10.0,
            "options_distribution": {10: 12032}
        }
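
# Example usage of calculate_dataset_statistics() (illustrative; the exact
# numbers depend on the dataset version returned by load_mmlu_pro(), so the
# values shown here are not guaranteed):
#
#     stats = calculate_dataset_statistics()
#     stats["total_questions"]        # e.g. 12032
#     stats["max_options"]            # e.g. 10
#     stats["options_distribution"]   # e.g. Counter({10: ..., 4: ...})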

def mmlupro_dataset_preview(regenerate_preview=True) -> Dict[str, Any]:
    """
    Generate or retrieve the MMLU-Pro dataset preview information.

    Args:
        regenerate_preview (bool): If True, recompute the preview even when a
            cached preview file exists.

    Returns:
        Dict[str, Any]: Dictionary containing dataset information
    """
    preview_file = "/data/mmlu_pro_dataset_preview_table.json"

    # Reuse the cached preview file when regeneration is not requested
    if not regenerate_preview and os.path.exists(preview_file):
        try:
            with open(preview_file, 'r') as f:
                preview_data = json.load(f)
            print("Loaded dataset preview from cache.")
            return preview_data
        except Exception as e:
            print(f"Error reading preview file: {e}")
            # If the file exists but can't be read, fall through and regenerate

    # Fields shared by the computed and the fallback previews
    shared_fields = {
        "dataset_name": "MMLU-Pro",
        "evaluation_type": "Multiple Choice",
        "description": "MMLU-Pro is a refined version of the MMLU dataset, which has been a standard for multiple-choice knowledge assessment. Recent research identified issues with the original MMLU, such as noisy data (some unanswerable questions) and decreasing difficulty due to advances in model capabilities and increased data contamination. MMLU-Pro addresses these issues by presenting models with 10 choices instead of 4, requiring reasoning on more questions, and undergoing expert review to reduce noise. As a result, MMLU-Pro is of higher quality and currently more challenging than the original. A higher score is a better score.",
        "links": {
            "huggingface": "https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro",
            "github": "https://github.com/TIGER-AI-Lab/MMLU-Pro",
            "paper": "https://arxiv.org/abs/2406.01574"
        },
        "organization": "Questions are organized into 14 subjects. Each subject has 5 validation questions (for a total of 70). The 5 validation questions serve as 5-shot prompts for each evaluation question."
    }

    # Generate preview data if the file doesn't exist or couldn't be read
    try:
        stats = calculate_dataset_statistics()

        # Format the options distribution as a human-readable string
        options_dist_str = f"Maximum: {stats['max_options']}\nAverage: {stats['avg_options']:.2f}\n"
        sorted_options = sorted(stats["options_distribution"].items(), key=lambda x: x[0], reverse=True)
        options_dist_str += ", ".join(f"{num_options}-choices: {count}" for num_options, count in sorted_options)

        preview_data = {
            **shared_fields,
            "total_questions": stats["total_questions"],
            "subject_counts": stats["subject_counts"],
            "choices_per_question": options_dist_str
        }

        # Cache the preview data; a write failure here is non-fatal
        try:
            with open(preview_file, 'w') as f:
                json.dump(preview_data, f, indent=2)
        except Exception as e:
            print(f"Error writing preview file: {e}")
    except Exception as e:
        # If calculation fails, fall back to hardcoded values
        print(f"Error calculating dynamic values: {e}")
        preview_data = {
            **shared_fields,
            "total_questions": 12032,
            # Keep this a dict so subject_counts_formatting() can consume it
            "subject_counts": {"Total (fallback value)": 12032},
            "choices_per_question": "Maximum: 10\nAverage: 10.0\n10-choices: 12032"
        }

    return preview_data
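
# Illustrative call pattern: the default regenerate_preview=True always
# recomputes and refreshes the cache, while regenerate_preview=False prefers
# the cached JSON at /data/mmlu_pro_dataset_preview_table.json when readable:
#
#     preview = mmlupro_dataset_preview()                          # recompute + cache
#     preview = mmlupro_dataset_preview(regenerate_preview=False)  # reuse cache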

def subject_counts_formatting(subject_counts, total_questions):
    """Format subject counts as a string, in descending order of count."""
    sorted_subjects = sorted(subject_counts.items(), key=lambda x: x[1], reverse=True)
    subject_counts_str = f"Total: {total_questions}\n"
    for subject, count in sorted_subjects:
        subject_counts_str += f"{subject}: {count}\n"
    return subject_counts_str.strip()
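
# Example (illustrative; the subject names and counts below are hypothetical):
#
#     subject_counts_formatting({"math": 1351, "law": 1101}, 2452)
#     # -> "Total: 2452\nmath: 1351\nlaw: 1101"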

def format_preview_for_display(preview_data: Dict[str, Any]) -> pd.DataFrame:
    """
    Format the preview data with improved readability for display in Gradio.

    Args:
        preview_data (Dict[str, Any]): Dataset preview information

    Returns:
        pd.DataFrame: Formatted data for display
    """
    # Render the links as a bulleted list, one per line
    links_formatted = (
        f"• Dataset: {preview_data['links']['huggingface']}\n"
        f"• GitHub: {preview_data['links']['github']}\n"
        f"• Paper: {preview_data['links']['paper']}"
    )

    # Build a two-column property/details table
    rows = [
        {"Dataset Property": "Dataset Name", "Details": preview_data["dataset_name"]},
        {"Dataset Property": "Evaluation Type", "Details": preview_data["evaluation_type"]},
        {"Dataset Property": "Description", "Details": preview_data["description"]},
        {"Dataset Property": "Links", "Details": links_formatted},
        {"Dataset Property": "Organization", "Details": preview_data["organization"]},
        {"Dataset Property": "Number of Questions",
         "Details": subject_counts_formatting(preview_data["subject_counts"],
                                              preview_data["total_questions"])},
        {"Dataset Property": "Choices per Question", "Details": preview_data["choices_per_question"]}
    ]
    return pd.DataFrame(rows)
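
# Sketch of wiring the table into a Gradio app (illustrative; assumes a
# standard `gradio` install and that a gr.Dataframe component is the desired
# display surface):
#
#     import gradio as gr
#
#     with gr.Blocks() as demo:
#         gr.Dataframe(value=format_preview_for_display(mmlupro_dataset_preview()),
#                      wrap=True)
#     demo.launch()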

# For standalone testing
if __name__ == "__main__":
    preview_data = mmlupro_dataset_preview()
    print("Preview data generated:")
    for key, value in preview_data.items():
        if key != "links":
            print(f"\n{key}:\n{value}")