import json
import argparse
import os
# TEST WITH
# python3 utilities/evaluate_local.py --answers_file ./question_set/agent_answers.json
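
# Expected input shapes (inferred from the evaluation code below; field names follow the code, not an external spec):
#   questions file : a JSON object mapping task_id -> question details,
#                    e.g. {"<task_id>": {"Level": 1, "Final Answer": "...", ...}, ...}
#   answers file   : a JSON list of {"task_id": "...", "submitted_answer": "..."} entries
#                    (a JSON object wrapping that list under an "answers" key is also accepted)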


def load_json(filepath):
    """Loads JSON data from a file."""
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
        return None
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {filepath}")
        return None


def evaluate_answers(questions_data, agent_answers_data, level_filter=None):
    """
    Evaluates agent answers against the ground truth.

    Args:
        questions_data (dict): Dictionary mapping task_id to question details, including 'Final Answer'.
        agent_answers_data (list): List of dictionaries with 'task_id' and 'submitted_answer'.
        level_filter (int, optional): Restrict evaluation to this GAIA level. Defaults to None (all levels).

    Returns:
        tuple: (accuracy, correct_count, total_evaluated, incorrect_details)
            incorrect_details is a list of tuples: (task_id, expected, got)
    """
    correct_count = 0
    total_evaluated = 0
    incorrect_details = []

    agent_answers_map = {item['task_id']: item['submitted_answer'] for item in agent_answers_data}

    for task_id, question_info in questions_data.items():
        # Apply the level filter if one was specified.
        if level_filter is not None and question_info.get('Level') != level_filter:
            continue
        if task_id in agent_answers_map:
            total_evaluated += 1
            expected_answer = question_info.get('Final Answer')
            submitted_answer = agent_answers_map[task_id]
            # GAIA uses exact match.
            if str(submitted_answer) == str(expected_answer):
                correct_count += 1
            else:
                incorrect_details.append((task_id, expected_answer, submitted_answer))
        # else:
        #     print(f"Warning: No submitted answer found for task_id {task_id}")  # Optional warning

    accuracy = (correct_count / total_evaluated) * 100 if total_evaluated > 0 else 0
    return accuracy, correct_count, total_evaluated, incorrect_details
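
# Example with hypothetical data, illustrating the return value of evaluate_answers():
#   questions = {"t1": {"Level": 1, "Final Answer": "42"},
#                "t2": {"Level": 2, "Final Answer": "Paris"}}
#   answers   = [{"task_id": "t1", "submitted_answer": "42"},
#                {"task_id": "t2", "submitted_answer": "London"}]
#   evaluate_answers(questions, answers)     -> (50.0, 1, 2, [("t2", "Paris", "London")])
#   evaluate_answers(questions, answers, 1)  -> (100.0, 1, 1, [])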


def main():
    parser = argparse.ArgumentParser(description="Evaluate agent answers locally against GAIA ground truth.")
    parser.add_argument(
        "--questions_file",
        type=str,
        default="../question_set/new_gaia_questions.json",  # Default path relative to this script
        help="Path to the JSON file containing GAIA questions and answers."
    )
    parser.add_argument(
        "--answers_file",
        type=str,
        required=True,
        help="Path to the JSON file containing the agent's submitted answers."
    )
    parser.add_argument(
        "--level",
        type=int,
        choices=[1, 2, 3],
        default=None,  # None means evaluate all levels
        help="Specify the GAIA level (1, 2, or 3) to evaluate. Evaluates all levels if not specified."
    )
    parser.add_argument(
        "--verbose",
        action='store_true',
        help="Print details of incorrect answers."
    )
    args = parser.parse_args()

    # Construct absolute paths relative to the script location.
    script_dir = os.path.dirname(__file__)
    questions_filepath = os.path.abspath(os.path.join(script_dir, args.questions_file))
    # Assume the answers file lives in the repo root, one level above this script in utilities/.
    answers_filepath = os.path.abspath(os.path.join(script_dir, '..', args.answers_file))

    print(f"Loading questions from: {questions_filepath}")
    questions_data = load_json(questions_filepath)
    if questions_data is None:
        return

    print(f"Loading agent answers from: {answers_filepath}")
    agent_answers_data = load_json(answers_filepath)
    if agent_answers_data is None:
        return
    # Ensure agent_answers_data is a list; accept a dict that wraps the list
    # under an 'answers' key (a common mistake), and only error out otherwise.
    if not isinstance(agent_answers_data, list):
        if isinstance(agent_answers_data, dict) and isinstance(agent_answers_data.get('answers'), list):
            agent_answers_data = agent_answers_data['answers']
            print("Note: Loaded answers from the 'answers' key in the JSON object.")
        else:
            print(f"Error: Agent answers file ({args.answers_file}) should contain a JSON list.")
            return
    level_str = f"Level {args.level}" if args.level else "All Levels"
    print(f"\nEvaluating answers for: {level_str}")

    accuracy, correct_count, total_evaluated, incorrect_details = evaluate_answers(
        questions_data, agent_answers_data, args.level
    )

    if total_evaluated == 0:
        print("No answers found for the specified criteria.")
    else:
        print("\n--- Evaluation Results ---")
        print(f"Level Filter: {level_str}")
        print(f"Total Questions Evaluated: {total_evaluated}")
        print(f"Correct Answers: {correct_count}")
        print(f"Accuracy: {accuracy:.2f}%")
        if args.verbose and incorrect_details:
            print("\n--- Incorrect Answers ---")
            for task_id, expected, got in incorrect_details:
                print(f"  Task ID: {task_id}")
                print(f"    Expected: {expected}")
                print(f"    Got: {got}")
            print("------------------------")


if __name__ == "__main__":
    main()