Yago Bolivar committed
Commit: b121170
Parent(s): 4d7d7f8

feat: add evaluation and submission utilities for GAIA project
utilities/{compare_questions.py → compare_question_set.py}
RENAMED
File without changes
utilities/evaluate_local.py
ADDED
@@ -0,0 +1,140 @@
```python
import json
import argparse
import os

# TEST WITH
# python3 utilities/evaluate_local.py --answers_file ./question_set/agent_answers.json

def load_json(filepath):
    """Loads JSON data from a file."""
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
        return None
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {filepath}")
        return None

def evaluate_answers(questions_data, agent_answers_data, level_filter=None):
    """
    Evaluates agent answers against ground truth.

    Args:
        questions_data (dict): Dictionary mapping task_id to question details including 'Final Answer'.
        agent_answers_data (list): List of dictionaries with 'task_id' and 'submitted_answer'.
        level_filter (int, optional): Filter evaluation to only this GAIA level. Defaults to None.

    Returns:
        tuple: (accuracy, correct_count, total_evaluated, incorrect_details)
               incorrect_details is a list of tuples: (task_id, expected, got)
    """
    correct_count = 0
    total_evaluated = 0
    incorrect_details = []
    agent_answers_map = {item['task_id']: item['submitted_answer'] for item in agent_answers_data}

    for task_id, question_info in questions_data.items():
        # Apply level filter if specified
        if level_filter is not None and question_info.get('Level') != level_filter:
            continue

        if task_id in agent_answers_map:
            total_evaluated += 1
            expected_answer = question_info.get('Final Answer')
            submitted_answer = agent_answers_map[task_id]

            # GAIA uses exact match
            if str(submitted_answer) == str(expected_answer):
                correct_count += 1
            else:
                incorrect_details.append((task_id, expected_answer, submitted_answer))
        # else:
        #     print(f"Warning: No submitted answer found for task_id {task_id}")  # Optional warning

    accuracy = (correct_count / total_evaluated) * 100 if total_evaluated > 0 else 0
    return accuracy, correct_count, total_evaluated, incorrect_details

def main():
    parser = argparse.ArgumentParser(description="Evaluate agent answers locally against GAIA ground truth.")
    parser.add_argument(
        "--questions_file",
        type=str,
        default="../question_set/new_gaia_questions.json",  # Adjusted default path
        help="Path to the JSON file containing GAIA questions and answers."
    )
    parser.add_argument(
        "--answers_file",
        type=str,
        required=True,
        help="Path to the JSON file containing the agent's submitted answers."
    )
    parser.add_argument(
        "--level",
        type=int,
        choices=[1, 2, 3],
        default=None,  # Default is None, meaning evaluate all levels
        help="Specify the GAIA level (1, 2, or 3) to evaluate. Evaluates all levels if not specified."
    )
    parser.add_argument(
        "--verbose",
        action='store_true',
        help="Print details of incorrect answers."
    )

    args = parser.parse_args()

    # Construct absolute paths relative to the script location
    script_dir = os.path.dirname(__file__)
    questions_filepath = os.path.abspath(os.path.join(script_dir, args.questions_file))
    answers_filepath = os.path.abspath(os.path.join(script_dir, '..', args.answers_file))  # Assume answers file is in root relative to script in utilities

    print(f"Loading questions from: {questions_filepath}")
    questions_data = load_json(questions_filepath)
    if questions_data is None:
        return

    print(f"Loading agent answers from: {answers_filepath}")
    agent_answers_data = load_json(answers_filepath)
    if agent_answers_data is None:
        return

    # Ensure agent_answers_data is a list
    if not isinstance(agent_answers_data, list):
        # Attempt to recover if it's a dict containing a list (common mistake)
        if isinstance(agent_answers_data, dict) and 'answers' in agent_answers_data and isinstance(agent_answers_data['answers'], list):
            agent_answers_data = agent_answers_data['answers']
            print("Note: Loaded answers from the 'answers' key in the JSON object.")
        else:
            print(f"Error: Agent answers file ({args.answers_file}) should contain a JSON list.")
            return

    level_str = f"Level {args.level}" if args.level else "All Levels"
    print(f"\nEvaluating answers for: {level_str}")

    accuracy, correct_count, total_evaluated, incorrect_details = evaluate_answers(
        questions_data, agent_answers_data, args.level
    )

    if total_evaluated == 0:
        print("No answers found for the specified criteria.")
    else:
        print("\n--- Evaluation Results ---")
        print(f"Level Filter: {level_str}")
        print(f"Total Questions Evaluated: {total_evaluated}")
        print(f"Correct Answers: {correct_count}")
        print(f"Accuracy: {accuracy:.2f}%")

        if args.verbose and incorrect_details:
            print("\n--- Incorrect Answers ---")
            for task_id, expected, got in incorrect_details:
                print(f"  Task ID: {task_id}")
                print(f"  Expected: {expected}")
                print(f"  Got: {got}")
                print("------------------------")

if __name__ == "__main__":
    main()
```
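For reference, here is a minimal sketch of the two input shapes `evaluate_local.py` expects, inferred from the code above: `--questions_file` is a JSON object mapping each `task_id` to question details (including `Level` and `Final Answer`), and `--answers_file` is a JSON list of records with `task_id` and `submitted_answer` keys. The task IDs and answers below are placeholders, not real GAIA data.

```python
import json

# questions_file: dict mapping task_id -> question details.
# Keys 'Level' and 'Final Answer' match what evaluate_answers() reads;
# the task IDs and answers here are made up for illustration.
questions = {
    "task-001": {"Level": 1, "Final Answer": "Paris"},
    "task-002": {"Level": 2, "Final Answer": "42"},
}

# answers_file: list of {"task_id", "submitted_answer"} records.
answers = [
    {"task_id": "task-001", "submitted_answer": "Paris"},
    {"task_id": "task-002", "submitted_answer": "41"},
]

with open("new_gaia_questions.json", "w", encoding="utf-8") as f:
    json.dump(questions, f, indent=2)
with open("agent_answers.json", "w", encoding="utf-8") as f:
    json.dump(answers, f, indent=2)
```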
utilities/evaluate_local_commands.md
ADDED
@@ -0,0 +1,17 @@
**Run the Evaluation Script:** Open your terminal, navigate to the `utilities` directory, and run the script:

* **Evaluate all levels:**

  ```bash
  cd /Users/yagoairm2/Desktop/agents/final\ project/HF_Agents_Final_Project/utilities
  python evaluate_local.py --answers_file ../agent_answers.json
  ```

* **Evaluate only Level 1:**

  ```bash
  python evaluate_local.py --answers_file ../agent_answers.json --level 1
  ```

* **Evaluate Level 1 and show incorrect answers:**

  ```bash
  python evaluate_local.py --answers_file ../agent_answers.json --level 1 --verbose
  ```

This script calculates and prints accuracy based on the exact-match criterion GAIA uses, without submitting anything to the official leaderboard.
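Assuming the placeholder files from the sketch above (two questions, one matching answer), a run without a level filter would print output along these lines; the paths and counts here are illustrative, not captured from a real run:

```text
Loading questions from: /path/to/question_set/new_gaia_questions.json
Loading agent answers from: /path/to/agent_answers.json

Evaluating answers for: All Levels

--- Evaluation Results ---
Level Filter: All Levels
Total Questions Evaluated: 2
Correct Answers: 1
Accuracy: 50.00%
```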
utilities/{random_question_answer.py → random_question_submit.py}
RENAMED
File without changes