Yago Bolivar committed
Commit b121170 · 1 Parent(s): 4d7d7f8

feat: add evaluation and submission utilities for GAIA project

utilities/{compare_questions.py → compare_question_set.py} RENAMED
File without changes
utilities/evaluate_local.py ADDED
@@ -0,0 +1,140 @@
+ import json
+ import argparse
+ import os
+
+ # TEST WITH
+ # python3 utilities/evaluate_local.py --answers_file ./question_set/agent_answers.json
+
+ def load_json(filepath):
+     """Loads JSON data from a file."""
+     try:
+         with open(filepath, 'r', encoding='utf-8') as f:
+             return json.load(f)
+     except FileNotFoundError:
+         print(f"Error: File not found at {filepath}")
+         return None
+     except json.JSONDecodeError:
+         print(f"Error: Could not decode JSON from {filepath}")
+         return None
+
+ def evaluate_answers(questions_data, agent_answers_data, level_filter=None):
+     """
+     Evaluates agent answers against ground truth.
+
+     Args:
+         questions_data (dict): Dictionary mapping task_id to question details including 'Final Answer'.
+         agent_answers_data (list): List of dictionaries with 'task_id' and 'submitted_answer'.
+         level_filter (int, optional): Filter evaluation to only this GAIA level. Defaults to None.
+
+     Returns:
+         tuple: (accuracy, correct_count, total_evaluated, incorrect_details)
+                incorrect_details is a list of tuples: (task_id, expected, got)
+     """
+     correct_count = 0
+     total_evaluated = 0
+     incorrect_details = []
+     agent_answers_map = {item['task_id']: item['submitted_answer'] for item in agent_answers_data}
+
+     for task_id, question_info in questions_data.items():
+         # Apply level filter if specified
+         if level_filter is not None and question_info.get('Level') != level_filter:
+             continue
+
+         if task_id in agent_answers_map:
+             total_evaluated += 1
+             expected_answer = question_info.get('Final Answer')
+             submitted_answer = agent_answers_map[task_id]
+
+             # GAIA uses exact match
+             if str(submitted_answer) == str(expected_answer):
+                 correct_count += 1
+             else:
+                 incorrect_details.append((task_id, expected_answer, submitted_answer))
+         # else:
+         #     print(f"Warning: No submitted answer found for task_id {task_id}") # Optional warning
+
+     accuracy = (correct_count / total_evaluated) * 100 if total_evaluated > 0 else 0
+     return accuracy, correct_count, total_evaluated, incorrect_details
+
+ def main():
+     parser = argparse.ArgumentParser(description="Evaluate agent answers locally against GAIA ground truth.")
+     parser.add_argument(
+         "--questions_file",
+         type=str,
+         default="../question_set/new_gaia_questions.json", # Adjusted default path
+         help="Path to the JSON file containing GAIA questions and answers."
+     )
+     parser.add_argument(
+         "--answers_file",
+         type=str,
+         required=True,
+         help="Path to the JSON file containing the agent's submitted answers."
+     )
+     parser.add_argument(
+         "--level",
+         type=int,
+         choices=[1, 2, 3],
+         default=None, # Default is None, meaning evaluate all levels
+         help="Specify the GAIA level (1, 2, or 3) to evaluate. Evaluates all levels if not specified."
+     )
+     parser.add_argument(
+         "--verbose",
+         action='store_true', # Add verbose flag
+         help="Print details of incorrect answers."
+     )
+
+
+     args = parser.parse_args()
+
+     # Construct absolute paths relative to the script location
+     script_dir = os.path.dirname(__file__)
+     questions_filepath = os.path.abspath(os.path.join(script_dir, args.questions_file))
+     answers_filepath = os.path.abspath(os.path.join(script_dir, '..', args.answers_file)) # Assume answers file is in root relative to script in utilities
+
+     print(f"Loading questions from: {questions_filepath}")
+     questions_data = load_json(questions_filepath)
+     if questions_data is None:
+         return
+
+     print(f"Loading agent answers from: {answers_filepath}")
+     agent_answers_data = load_json(answers_filepath)
+     if agent_answers_data is None:
+         return
+
+     # Ensure agent_answers_data is a list
+     if not isinstance(agent_answers_data, list):
+         print(f"Error: Agent answers file ({args.answers_file}) should contain a JSON list.")
+         # Attempt to load if it's a dict containing a list (common mistake)
+         if isinstance(agent_answers_data, dict) and 'answers' in agent_answers_data and isinstance(agent_answers_data['answers'], list):
+             agent_answers_data = agent_answers_data['answers']
+             print("Note: Loaded answers from the 'answers' key in the JSON object.")
+         else:
+             return
+
+
+     level_str = f"Level {args.level}" if args.level else "All Levels"
+     print(f"\nEvaluating answers for: {level_str}")
+
+     accuracy, correct_count, total_evaluated, incorrect_details = evaluate_answers(
+         questions_data, agent_answers_data, args.level
+     )
+
+     if total_evaluated == 0:
+         print("No answers found for the specified criteria.")
+     else:
+         print("\n--- Evaluation Results ---")
+         print(f"Level Filter: {level_str}")
+         print(f"Total Questions Evaluated: {total_evaluated}")
+         print(f"Correct Answers: {correct_count}")
+         print(f"Accuracy: {accuracy:.2f}%")
+
+         if args.verbose and incorrect_details:
+             print("\n--- Incorrect Answers ---")
+             for task_id, expected, got in incorrect_details:
+                 print(f" Task ID: {task_id}")
+                 print(f" Expected: {expected}")
+                 print(f" Got: {got}")
+                 print("------------------------")
+
+ if __name__ == "__main__":
+     main()
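
For reference, here is a minimal sketch of the input shapes `evaluate_answers` expects. The task IDs and answers are made-up placeholders, not real GAIA data, and the import assumes the snippet is run from the `utilities` directory:

```python
from evaluate_local import evaluate_answers

# questions_data: task_id -> question details, including 'Level' and 'Final Answer'
questions_data = {
    "task-001": {"Level": 1, "Final Answer": "42"},
    "task-002": {"Level": 2, "Final Answer": "Paris"},
}

# agent_answers_data: list of {'task_id', 'submitted_answer'} objects
agent_answers_data = [
    {"task_id": "task-001", "submitted_answer": "42"},     # exact match -> correct
    {"task_id": "task-002", "submitted_answer": "paris"},  # case differs -> incorrect under exact match
]

accuracy, correct, total, incorrect = evaluate_answers(questions_data, agent_answers_data, level_filter=None)
print(accuracy, correct, total)  # 50.0 1 2
print(incorrect)                 # [('task-002', 'Paris', 'paris')]
```

Because the comparison is a plain `str()` equality check, any case or formatting difference counts as incorrect.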
utilities/evaluate_local_commands.md ADDED
@@ -0,0 +1,17 @@
+ **Run the Evaluation Script:** Open your terminal, navigate to the `utilities` directory, and run the script:
+
+ * **Evaluate all levels:**
+   ```bash
+   cd /Users/yagoairm2/Desktop/agents/final\ project/HF_Agents_Final_Project/utilities
+   python evaluate_local.py --answers_file ../agent_answers.json
+   ```
+ * **Evaluate only Level 1:**
+   ```bash
+   python evaluate_local.py --answers_file ../agent_answers.json --level 1
+   ```
+ * **Evaluate Level 1 and show incorrect answers:**
+   ```bash
+   python evaluate_local.py --answers_file ../agent_answers.json --level 1 --verbose
+   ```
+
+ This script will calculate and print the accuracy based on the exact match criterion used by GAIA, without submitting anything to the official leaderboard.
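
Before the agent has produced real output, you can smoke-test the script with a dummy answers file. The sketch below uses placeholder task IDs and writes to `question_set/agent_answers.json`, matching the `TEST WITH` example at the top of `evaluate_local.py`; an answer only counts toward the score if its `task_id` also appears in the questions file:

```python
import json
import os

# Placeholder answers (hypothetical task IDs); replace with your agent's real output.
dummy_answers = [
    {"task_id": "task-001", "submitted_answer": "42"},
    {"task_id": "task-002", "submitted_answer": "Paris"},
]

os.makedirs("question_set", exist_ok=True)
with open("question_set/agent_answers.json", "w", encoding="utf-8") as f:
    json.dump(dummy_answers, f, indent=2)

# Then, from the repository root:
# python3 utilities/evaluate_local.py --answers_file ./question_set/agent_answers.json
```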
utilities/{random_question_answer.py → random_question_submit.py} RENAMED
File without changes