# HF_Agents_Final_Project/utilities/evaluate_local.py
import json
import argparse
import os
# TEST WITH
# python3 utilities/evaluate_local.py --answers_file ./question_set/agent_answers.json
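#
# Expected file shapes (as consumed by the code below; task IDs are
# illustrative placeholders):
#   questions file: JSON object mapping task_id -> question info, e.g.
#     {"<task_id>": {"Level": 1, "Final Answer": "42", ...}, ...}
#   answers file: JSON list, e.g.
#     [{"task_id": "<task_id>", "submitted_answer": "42"}, ...]
# Optional flags: --level {1,2,3} restricts scoring to one GAIA level;
# --verbose prints each incorrect answer.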
def load_json(filepath):
"""Loads JSON data from a file."""
try:
with open(filepath, 'r', encoding='utf-8') as f:
return json.load(f)
except FileNotFoundError:
print(f"Error: File not found at {filepath}")
return None
except json.JSONDecodeError:
print(f"Error: Could not decode JSON from {filepath}")
return None
def evaluate_answers(questions_data, agent_answers_data, level_filter=None):
"""
Evaluates agent answers against ground truth.
Args:
questions_data (dict): Dictionary mapping task_id to question details including 'Final Answer'.
agent_answers_data (list): List of dictionaries with 'task_id' and 'submitted_answer'.
level_filter (int, optional): Filter evaluation to only this GAIA level. Defaults to None.
Returns:
tuple: (accuracy, correct_count, total_evaluated, incorrect_details)
incorrect_details is a list of tuples: (task_id, expected, got)
"""
correct_count = 0
total_evaluated = 0
incorrect_details = []
    # Build a task_id -> answer lookup; tolerate entries missing 'submitted_answer'.
    agent_answers_map = {item['task_id']: item.get('submitted_answer') for item in agent_answers_data}
for task_id, question_info in questions_data.items():
# Apply level filter if specified
if level_filter is not None and question_info.get('Level') != level_filter:
continue
if task_id in agent_answers_map:
total_evaluated += 1
expected_answer = question_info.get('Final Answer')
submitted_answer = agent_answers_map[task_id]
            # Strict string comparison. Note: the official GAIA scorer uses a
            # "quasi-exact match" with normalization, so this local check can
            # be stricter than the leaderboard (see the _normalize sketch below).
if str(submitted_answer) == str(expected_answer):
correct_count += 1
else:
incorrect_details.append((task_id, expected_answer, submitted_answer))
# else:
# print(f"Warning: No submitted answer found for task_id {task_id}") # Optional warning
accuracy = (correct_count / total_evaluated) * 100 if total_evaluated > 0 else 0
return accuracy, correct_count, total_evaluated, incorrect_details
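
# The comparison above is a strict string match. The official GAIA scorer
# ("quasi-exact match") normalizes answers before comparing; the helper below
# is a rough local approximation of that idea, not the official implementation,
# and is not wired into evaluate_answers by default. The name _normalize and
# the numeric handling are illustrative assumptions.
def _normalize(answer):
    """Approximate GAIA-style normalization: numbers compared as numbers,
    strings compared lowercase with surrounding whitespace stripped."""
    text = str(answer).strip()
    try:
        # Treat "1,000" and "1000.0" as the same numeric answer.
        return float(text.replace(",", ""))
    except ValueError:
        return text.lower()

# Usage sketch: replace the strict check with
#   _normalize(submitted_answer) == _normalize(expected_answer)
# for a more forgiving local score.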
def main():
parser = argparse.ArgumentParser(description="Evaluate agent answers locally against GAIA ground truth.")
parser.add_argument(
"--questions_file",
type=str,
default="../question_set/new_gaia_questions.json", # Adjusted default path
help="Path to the JSON file containing GAIA questions and answers."
)
parser.add_argument(
"--answers_file",
type=str,
required=True,
help="Path to the JSON file containing the agent's submitted answers."
)
parser.add_argument(
"--level",
type=int,
choices=[1, 2, 3],
default=None, # Default is None, meaning evaluate all levels
help="Specify the GAIA level (1, 2, or 3) to evaluate. Evaluates all levels if not specified."
)
parser.add_argument(
"--verbose",
action='store_true', # Add verbose flag
help="Print details of incorrect answers."
)
args = parser.parse_args()
    # Resolve relative paths against the repository root (one level above this
    # script), so the test command at the top works when run from the root.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    repo_root = os.path.abspath(os.path.join(script_dir, '..'))
    questions_filepath = os.path.abspath(os.path.join(repo_root, args.questions_file))
    answers_filepath = os.path.abspath(os.path.join(repo_root, args.answers_file))
print(f"Loading questions from: {questions_filepath}")
questions_data = load_json(questions_filepath)
if questions_data is None:
return
print(f"Loading agent answers from: {answers_filepath}")
agent_answers_data = load_json(answers_filepath)
if agent_answers_data is None:
return
# Ensure agent_answers_data is a list
    # The answers file should contain a JSON list; tolerate the common mistake
    # of wrapping the list in an object under an 'answers' key.
    if not isinstance(agent_answers_data, list):
        if isinstance(agent_answers_data, dict) and isinstance(agent_answers_data.get('answers'), list):
            agent_answers_data = agent_answers_data['answers']
            print("Note: Loaded answers from the 'answers' key in the JSON object.")
        else:
            print(f"Error: Agent answers file ({args.answers_file}) should contain a JSON list.")
            return
level_str = f"Level {args.level}" if args.level else "All Levels"
print(f"\nEvaluating answers for: {level_str}")
accuracy, correct_count, total_evaluated, incorrect_details = evaluate_answers(
questions_data, agent_answers_data, args.level
)
if total_evaluated == 0:
print("No answers found for the specified criteria.")
else:
print("\n--- Evaluation Results ---")
print(f"Level Filter: {level_str}")
print(f"Total Questions Evaluated: {total_evaluated}")
print(f"Correct Answers: {correct_count}")
print(f"Accuracy: {accuracy:.2f}%")
if args.verbose and incorrect_details:
print("\n--- Incorrect Answers ---")
for task_id, expected, got in incorrect_details:
print(f" Task ID: {task_id}")
print(f" Expected: {expected}")
print(f" Got: {got}")
print("------------------------")
if __name__ == "__main__":
main()