import json
import argparse
import os

# TEST WITH
# python3 utilities/evaluate_local.py --answers_file ./question_set/agent_answers.json
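# python3 utilities/evaluate_local.py --answers_file ./question_set/agent_answers.json --level 1 --verbose
#
# Expected shape of the answers file, inferred from the keys read in
# evaluate_answers() below (values shown are placeholders):
# [
#   {"task_id": "<GAIA task id>", "submitted_answer": "<agent's final answer>"},
#   ...
# ]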

def load_json(filepath):
    """Loads JSON data from a file."""
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
        return None
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {filepath}")
        return None

def evaluate_answers(questions_data, agent_answers_data, level_filter=None):
    """
    Evaluates agent answers against ground truth.

    Args:
        questions_data (dict): Dictionary mapping task_id to question details including 'Final Answer'.
        agent_answers_data (list): List of dictionaries with 'task_id' and 'submitted_answer'.
        level_filter (int, optional): Filter evaluation to only this GAIA level. Defaults to None.

    Returns:
        tuple: (accuracy, correct_count, total_evaluated, incorrect_details)
               incorrect_details is a list of tuples: (task_id, expected, got)
    """
    correct_count = 0
    total_evaluated = 0
    incorrect_details = []
    agent_answers_map = {item['task_id']: item['submitted_answer'] for item in agent_answers_data}

    for task_id, question_info in questions_data.items():
        # Apply level filter if specified
        if level_filter is not None and question_info.get('Level') != level_filter:
            continue

        if task_id in agent_answers_map:
            total_evaluated += 1
            expected_answer = question_info.get('Final Answer')
            submitted_answer = agent_answers_map[task_id]

            # GAIA uses exact match
            if str(submitted_answer) == str(expected_answer):
                correct_count += 1
            else:
                incorrect_details.append((task_id, expected_answer, submitted_answer))
        # else:
        #     print(f"Warning: No submitted answer found for task_id {task_id}") # Optional warning

    accuracy = (correct_count / total_evaluated) * 100 if total_evaluated > 0 else 0
    return accuracy, correct_count, total_evaluated, incorrect_details
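
# Illustrative call with made-up data (not taken from the real question set):
#   questions = {"task-1": {"Level": 1, "Final Answer": "42"}}
#   answers = [{"task_id": "task-1", "submitted_answer": "42"}]
#   evaluate_answers(questions, answers)  # -> (100.0, 1, 1, [])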

def main():
    parser = argparse.ArgumentParser(description="Evaluate agent answers locally against GAIA ground truth.")
    parser.add_argument(
        "--questions_file",
        type=str,
        default="../question_set/new_gaia_questions.json", # Adjusted default path
        help="Path to the JSON file containing GAIA questions and answers."
    )
    parser.add_argument(
        "--answers_file",
        type=str,
        required=True,
        help="Path to the JSON file containing the agent's submitted answers."
    )
    parser.add_argument(
        "--level",
        type=int,
        choices=[1, 2, 3],
        default=None, # Default is None, meaning evaluate all levels
        help="Specify the GAIA level (1, 2, or 3) to evaluate. Evaluates all levels if not specified."
    )
    parser.add_argument(
        "--verbose",
        action='store_true', # Add verbose flag
        help="Print details of incorrect answers."
    )

    args = parser.parse_args()

    # Construct absolute paths relative to the script location
    script_dir = os.path.dirname(__file__)
    questions_filepath = os.path.abspath(os.path.join(script_dir, args.questions_file))
    answers_filepath = os.path.abspath(os.path.join(script_dir, '..', args.answers_file))  # Answers path is resolved relative to the repo root, one level above this utilities/ script

    print(f"Loading questions from: {questions_filepath}")
    questions_data = load_json(questions_filepath)
    if questions_data is None:
        return

    print(f"Loading agent answers from: {answers_filepath}")
    agent_answers_data = load_json(answers_filepath)
    if agent_answers_data is None:
        return

    # Ensure agent_answers_data is a list; recover if the list is wrapped in an object
    if not isinstance(agent_answers_data, list):
        # Attempt to recover if it's a dict containing a list under 'answers' (common mistake)
        if isinstance(agent_answers_data, dict) and isinstance(agent_answers_data.get('answers'), list):
            agent_answers_data = agent_answers_data['answers']
            print("Note: Loaded answers from the 'answers' key in the JSON object.")
        else:
            print(f"Error: Agent answers file ({args.answers_file}) should contain a JSON list.")
            return

    level_str = f"Level {args.level}" if args.level else "All Levels"
    print(f"\nEvaluating answers for: {level_str}")

    accuracy, correct_count, total_evaluated, incorrect_details = evaluate_answers(
        questions_data, agent_answers_data, args.level
    )

    if total_evaluated == 0:
        print("No answers found for the specified criteria.")
    else:
        print("\n--- Evaluation Results ---")
        print(f"Level Filter: {level_str}")
        print(f"Total Questions Evaluated: {total_evaluated}")
        print(f"Correct Answers: {correct_count}")
        print(f"Accuracy: {accuracy:.2f}%")

        if args.verbose and incorrect_details:
            print("\n--- Incorrect Answers ---")
            for task_id, expected, got in incorrect_details:
                print(f"  Task ID: {task_id}")
                print(f"    Expected: {expected}")
                print(f"    Got:      {got}")
            print("------------------------")

if __name__ == "__main__":
    main()