import os
import json
import collections
from typing import Dict, Any

import pandas as pd

from mmlu_pro_eval_adapted import load_mmlu_pro

def calculate_dataset_statistics():
    """
    Calculate detailed statistics about the MMLU-Pro test split.

    Returns:
        Dict: Statistics with keys "total_questions", "subject_counts",
        "max_options", "avg_options", and "options_distribution".
    """
    try:
        # Load MMLU-Pro data using the function from mmlu_pro_eval_adapted
        test_df, val_df = load_mmlu_pro()
        
        # Ensure consistent ordering
        test_df = test_df.sort_values(['category', 'question_id'])

        # Calculate total questions
        total_questions = len(test_df)
        
        # Calculate subject-wise question counts efficiently
        subject_counts = test_df['category'].value_counts().to_dict()

        # Count options per question efficiently using `.apply()`
        options_counts = test_df['options'].apply(len).tolist()
        max_options = max(options_counts)
        avg_options = sum(options_counts) / len(options_counts)

        # Count frequency of each option count
        options_distribution = collections.Counter(options_counts)
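        # e.g. Counter({10: 11000, 4: 1032}), mapping an option count to the
        # number of questions offering that many choices (counts illustrative)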
        
        return {
            "total_questions": total_questions,
            "subject_counts": subject_counts,
            "max_options": max_options,
            "avg_options": avg_options,
            "options_distribution": options_distribution
        }
        
    except Exception as e:
        print(f"Error calculating dataset statistics: {e}")
        # Fallback values if calculation fails
        return {
            "total_questions": 12032,
            "subject_counts": {"Total": 12032},
            "max_options": 10,
            "avg_options": 10.0,
            "options_distribution": {10: 12032}
        }
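
# Hedged usage sketch: the shape of the dictionary returned when the MMLU-Pro
# test split loads successfully. The average and per-subject counts shown are
# illustrative placeholders, not measured values:
#
#   stats = calculate_dataset_statistics()
#   # {
#   #     "total_questions": 12032,
#   #     "subject_counts": {"math": ..., "physics": ..., ...},
#   #     "max_options": 10,
#   #     "avg_options": 9.5,                       # illustrative
#   #     "options_distribution": Counter({10: ..., ...}),
#   # }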

def mmlupro_dataset_preview(regenerate_preview=True) -> Dict[str, Any]:
    """
    Generate or retrieve the MMLU-Pro dataset preview information.

    Args:
        regenerate_preview (bool): If True, recompute the preview even when a
            cached preview file exists; if False, reuse the cached file.

    Returns:
        Dict[str, Any]: Dictionary containing dataset information
    """
    preview_file = "/data/mmlu_pro_dataset_preview_table.json"
    
    # Check if preview file exists
    if not regenerate_preview and os.path.exists(preview_file):
        try:
            # Read existing preview file
            with open(preview_file, 'r') as f:
                preview_data = json.load(f)
            return preview_data
        except Exception as e:
            print(f"Error reading preview file: {e}")
            # If file exists but can't be read, regenerate it
    
    # Generate preview data if file doesn't exist or couldn't be read
    try:
        # Calculate dataset statistics
        stats = calculate_dataset_statistics()
                
        # Format options distribution as a string
        options_dist_str = f"Maximum: {stats['max_options']}\nAverage: {stats['avg_options']:.2f}\n"
        sorted_options = sorted(stats["options_distribution"].items(), key=lambda x: x[0], reverse=True)
        for num_options, count in sorted_options:
            options_dist_str += f"{num_options}-choices: {count}, "
        options_dist_str = options_dist_str.rstrip(", ")
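        # With the hardcoded fallback statistics, for example, this yields:
        #   "Maximum: 10\nAverage: 10.00\n10-choices: 12032"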
        
        # Create preview data
        preview_data = {
            "dataset_name": "MMLU-Pro",
            "evaluation_type": "Multiple Choice",
            "description": "MMLU-Pro is a refined version of the MMLU dataset, which has been a standard for multiple-choice knowledge assessment. Recent research identified issues with the original MMLU, such as noisy data (some unanswerable questions) and decreasing difficulty due to advances in model capabilities and increased data contamination. MMLU-Pro addresses these issues by presenting models with 10 choices instead of 4, requiring reasoning on more questions, and undergoing expert review to reduce noise. As a result, MMLU-Pro is of higher quality and currently more challenging than the original. A higher score is a better score.",
            "links": {
                "huggingface": "https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro",
                "github": "https://github.com/TIGER-AI-Lab/MMLU-Pro",
                "paper": "https://arxiv.org/abs/2406.01574"
            },
            "organization": "Questions are organized into 14 subjects. Each subject has 5 validation questions (for a total of 70). The 5 validation questions serve as 5-shot prompts for each evaluation question.",
            "total_questions": stats["total_questions"],
            "subject_counts": stats["subject_counts"],
            "choices_per_question": options_dist_str
        }
        
        # Save preview data to file
        try:
            with open(preview_file, 'w') as f:
                json.dump(preview_data, f, indent=2)
        except Exception as e:
            print(f"Error writing preview file: {e}")
            
    except Exception as e:
        # If calculation fails, fall back to hardcoded values
        print(f"Error calculating dynamic values: {e}")
        # Hardcoded fallback values
        
        preview_data = {
            "dataset_name": "MMLU-Pro",
            "evaluation_type": "Multiple Choice",
            "description": "MMLU-Pro is a refined version of the MMLU dataset, which has been a standard for multiple-choice knowledge assessment. Recent research identified issues with the original MMLU, such as noisy data (some unanswerable questions) and decreasing difficulty due to advances in model capabilities and increased data contamination. MMLU-Pro addresses these issues by presenting models with 10 choices instead of 4, requiring reasoning on more questions, and undergoing expert review to reduce noise. As a result, MMLU-Pro is of higher quality and currently more challenging than the original. A higher score is a better score.",
            "links": {
                "huggingface": "https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro",
                "github": "https://github.com/TIGER-AI-Lab/MMLU-Pro",
                "paper": "https://arxiv.org/abs/2406.01574"
            },
            "organization": "Questions are organized into 14 subjects. Each subject has 5 validation questions (for a total of 70). The 5 validation questions serve as 5-shot prompts for each evaluation question.",
            "total_questions": 12032,
            "subject_counts": {"Total": 12032},  # a dict, so subject_counts_formatting still works
            "choices_per_question": "Maximum: 10\nAverage: 10.0\n10-choices: 12032"
        }
    
    return preview_data

def subject_counts_formatting(subject_counts, total_questions):
    """Format subject counts as a multi-line string, in descending order of count."""
    sorted_subjects = sorted(subject_counts.items(), key=lambda x: x[1], reverse=True)
    subject_counts_str = f"Total: {total_questions}\n"
    for subject, count in sorted_subjects:
        subject_counts_str += f"{subject}: {count}\n"
    subject_counts_str = subject_counts_str.strip()
    return subject_counts_str
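
# Hedged example (subject names and counts hypothetical):
#   subject_counts_formatting({"math": 1351, "law": 1101}, 2452)
#   returns "Total: 2452\nmath: 1351\nlaw: 1101"
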
def format_preview_for_display(preview_data: Dict[str, Any]) -> pd.DataFrame:
    """
    Format the preview data with improved readability for display in Gradio
    
    Args:
        preview_data (Dict[str, Any]): Dataset preview information
        
    Returns:
        pd.DataFrame: Formatted data for display
    """
    # Create links with bullet points
    link_lines = [
        f"Dataset: {preview_data['links']['huggingface']}",
        f"GitHub: {preview_data['links']['github']}",
        f"Paper: {preview_data['links']['paper']}",
    ]
    links_formatted = "• " + "\n• ".join(link_lines)
    
    # Create a table format with better column names
    rows = [
        {"Dataset Property": "Dataset Name", "Details": preview_data["dataset_name"]},
        {"Dataset Property": "Evaluation Type", "Details": preview_data["evaluation_type"]},
        {"Dataset Property": "Description", "Details": preview_data["description"]},
        {"Dataset Property": "Links", "Details": links_formatted},
        {"Dataset Property": "Organization", "Details": preview_data["organization"]},
        {"Dataset Property": "Number of Questions", "Details": subject_counts_formatting(preview_data["subject_counts"], preview_data["total_questions"])},
        {"Dataset Property": "Choices per Question", "Details": preview_data["choices_per_question"]}
    ]
    
    return pd.DataFrame(rows)
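
# Hedged usage sketch: rendering the preview table in Gradio. The surrounding
# app is not part of this module; gr.Blocks and gr.Dataframe are standard
# Gradio components assumed here:
#
#   import gradio as gr
#   df = format_preview_for_display(mmlupro_dataset_preview())
#   with gr.Blocks() as demo:
#       gr.Dataframe(value=df)
#   demo.launch()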

# For standalone testing
if __name__ == "__main__":
    preview_data = mmlupro_dataset_preview()
    print("Preview data generated:")
    for key, value in preview_data.items():
        if key != "links":
            print(f"\n{key}:\n{value}")