# HF_Agents_Final_Project/utilities/evaluate_local.py
import json
import argparse
import os
# TEST WITH
# python3 utilities/evaluate_local.py --answers_file ./question_set/agent_answers.json
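#
# Expected file shapes (as consumed by the code below; task IDs are
# illustrative placeholders):
#   questions file: JSON object mapping task_id -> question info, e.g.
#     {"<task_id>": {"Level": 1, "Final Answer": "42", ...}, ...}
#   answers file: JSON list, e.g.
#     [{"task_id": "<task_id>", "submitted_answer": "42"}, ...]
# Optional flags: --level {1,2,3} restricts scoring to one GAIA level;
# --verbose prints each incorrect answer.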
def load_json(filepath):
"""Loads JSON data from a file."""
try:
with open(filepath, 'r', encoding='utf-8') as f:
return json.load(f)
except FileNotFoundError:
print(f"Error: File not found at {filepath}")
return None
except json.JSONDecodeError:
print(f"Error: Could not decode JSON from {filepath}")
return None
def evaluate_answers(questions_data, agent_answers_data, level_filter=None):
"""
Evaluates agent answers against ground truth.
Args:
questions_data (dict): Dictionary mapping task_id to question details including 'Final Answer'.
agent_answers_data (list): List of dictionaries with 'task_id' and 'submitted_answer'.
level_filter (int, optional): Filter evaluation to only this GAIA level. Defaults to None.
Returns:
tuple: (accuracy, correct_count, total_evaluated, incorrect_details)
incorrect_details is a list of tuples: (task_id, expected, got)
"""
correct_count = 0
total_evaluated = 0
incorrect_details = []
    # Build a task_id -> answer lookup; tolerate entries missing 'submitted_answer'.
    agent_answers_map = {item['task_id']: item.get('submitted_answer') for item in agent_answers_data}
for task_id, question_info in questions_data.items():
# Apply level filter if specified
if level_filter is not None and question_info.get('Level') != level_filter:
continue
if task_id in agent_answers_map:
total_evaluated += 1
expected_answer = question_info.get('Final Answer')
submitted_answer = agent_answers_map[task_id]
            # Strict string comparison. Note: the official GAIA scorer uses a
            # "quasi-exact match" with normalization, so this local check can
            # be stricter than the leaderboard (see the _normalize sketch below).
if str(submitted_answer) == str(expected_answer):
correct_count += 1
else:
incorrect_details.append((task_id, expected_answer, submitted_answer))
# else:
# print(f"Warning: No submitted answer found for task_id {task_id}") # Optional warning
accuracy = (correct_count / total_evaluated) * 100 if total_evaluated > 0 else 0
return accuracy, correct_count, total_evaluated, incorrect_details
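
# The comparison above is a strict string match. The official GAIA scorer
# ("quasi-exact match") normalizes answers before comparing; the helper below
# is a rough local approximation of that idea, not the official implementation,
# and is not wired into evaluate_answers by default. The name _normalize and
# the numeric handling are illustrative assumptions.
def _normalize(answer):
    """Approximate GAIA-style normalization: numbers compared as numbers,
    strings compared lowercase with surrounding whitespace stripped."""
    text = str(answer).strip()
    try:
        # Treat "1,000" and "1000.0" as the same numeric answer.
        return float(text.replace(",", ""))
    except ValueError:
        return text.lower()

# Usage sketch: replace the strict check with
#   _normalize(submitted_answer) == _normalize(expected_answer)
# for a more forgiving local score.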
def main():
parser = argparse.ArgumentParser(description="Evaluate agent answers locally against GAIA ground truth.")
parser.add_argument(
"--questions_file",
type=str,
default="../question_set/new_gaia_questions.json", # Adjusted default path
help="Path to the JSON file containing GAIA questions and answers."
)
parser.add_argument(
"--answers_file",
type=str,
required=True,
help="Path to the JSON file containing the agent's submitted answers."
)
parser.add_argument(
"--level",
type=int,
choices=[1, 2, 3],
default=None, # Default is None, meaning evaluate all levels
help="Specify the GAIA level (1, 2, or 3) to evaluate. Evaluates all levels if not specified."
)
parser.add_argument(
"--verbose",
action='store_true', # Add verbose flag
help="Print details of incorrect answers."
)
args = parser.parse_args()
    # Resolve relative paths against the repository root (one level above this
    # script), so the test command at the top works when run from the root.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    repo_root = os.path.abspath(os.path.join(script_dir, '..'))
    questions_filepath = os.path.abspath(os.path.join(repo_root, args.questions_file))
    answers_filepath = os.path.abspath(os.path.join(repo_root, args.answers_file))
print(f"Loading questions from: {questions_filepath}")
questions_data = load_json(questions_filepath)
if questions_data is None:
return
print(f"Loading agent answers from: {answers_filepath}")
agent_answers_data = load_json(answers_filepath)
if agent_answers_data is None:
return
# Ensure agent_answers_data is a list
    # The answers file should contain a JSON list; tolerate the common mistake
    # of wrapping the list in an object under an 'answers' key.
    if not isinstance(agent_answers_data, list):
        if isinstance(agent_answers_data, dict) and isinstance(agent_answers_data.get('answers'), list):
            agent_answers_data = agent_answers_data['answers']
            print("Note: Loaded answers from the 'answers' key in the JSON object.")
        else:
            print(f"Error: Agent answers file ({args.answers_file}) should contain a JSON list.")
            return
level_str = f"Level {args.level}" if args.level else "All Levels"
print(f"\nEvaluating answers for: {level_str}")
accuracy, correct_count, total_evaluated, incorrect_details = evaluate_answers(
questions_data, agent_answers_data, args.level
)
if total_evaluated == 0:
print("No answers found for the specified criteria.")
else:
print("\n--- Evaluation Results ---")
print(f"Level Filter: {level_str}")
print(f"Total Questions Evaluated: {total_evaluated}")
print(f"Correct Answers: {correct_count}")
print(f"Accuracy: {accuracy:.2f}%")
if args.verbose and incorrect_details:
print("\n--- Incorrect Answers ---")
for task_id, expected, got in incorrect_details:
print(f" Task ID: {task_id}")
print(f" Expected: {expected}")
print(f" Got: {got}")
print("------------------------")
if __name__ == "__main__":
main()