import argparse
import json
import os


def load_json(filepath):
    """Loads JSON data from a file."""
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
        return None
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {filepath}")
        return None


def evaluate_answers(questions_data, agent_answers_data, level_filter=None):
    """
    Evaluates agent answers against ground truth.

    Args:
        questions_data (dict): Dictionary mapping task_id to question details, including 'Final Answer'.
        agent_answers_data (list): List of dictionaries with 'task_id' and 'submitted_answer'.
        level_filter (int, optional): Restrict evaluation to this GAIA level. Defaults to None (all levels).

    Returns:
        tuple: (accuracy, correct_count, total_evaluated, incorrect_details), where
            incorrect_details is a list of (task_id, expected, got) tuples.
    """
    correct_count = 0
    total_evaluated = 0
    incorrect_details = []

    # Build a task_id -> submitted_answer lookup for the agent's answers.
    agent_answers_map = {item['task_id']: item['submitted_answer'] for item in agent_answers_data}

    for task_id, question_info in questions_data.items():
        if level_filter is not None and question_info.get('Level') != level_filter:
            continue

        if task_id in agent_answers_map:
            total_evaluated += 1
            expected_answer = question_info.get('Final Answer')
            submitted_answer = agent_answers_map[task_id]

            # Strict, case-sensitive comparison of the stringified answers.
            if str(submitted_answer) == str(expected_answer):
                correct_count += 1
            else:
                incorrect_details.append((task_id, expected_answer, submitted_answer))

    accuracy = (correct_count / total_evaluated) * 100 if total_evaluated > 0 else 0
    return accuracy, correct_count, total_evaluated, incorrect_details
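
# Illustrative sketch of the shapes evaluate_answers() expects, per its docstring
# (the task ID and answers below are made up):
#
#   questions = {"task-001": {"Level": 1, "Final Answer": "42"}}
#   answers = [{"task_id": "task-001", "submitted_answer": "42"}]
#   evaluate_answers(questions, answers, level_filter=1)  # -> (100.0, 1, 1, [])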


def main():
    parser = argparse.ArgumentParser(description="Evaluate agent answers locally against GAIA ground truth.")
    parser.add_argument(
        "--questions_file",
        type=str,
        default="../question_set/new_gaia_questions.json",
        help="Path to the JSON file containing GAIA questions and answers."
    )
    parser.add_argument(
        "--answers_file",
        type=str,
        required=True,
        help="Path to the JSON file containing the agent's submitted answers."
    )
    parser.add_argument(
        "--level",
        type=int,
        choices=[1, 2, 3],
        default=None,
        help="Specify the GAIA level (1, 2, or 3) to evaluate. Evaluates all levels if not specified."
    )
    parser.add_argument(
        "--verbose",
        action='store_true',
        help="Print details of incorrect answers."
    )

    args = parser.parse_args()

    # The questions file is resolved relative to this script's directory; the answers
    # file is resolved relative to the script's parent directory.
    script_dir = os.path.dirname(__file__)
    questions_filepath = os.path.abspath(os.path.join(script_dir, args.questions_file))
    answers_filepath = os.path.abspath(os.path.join(script_dir, '..', args.answers_file))

    print(f"Loading questions from: {questions_filepath}")
    questions_data = load_json(questions_filepath)
    if questions_data is None:
        return

    print(f"Loading agent answers from: {answers_filepath}")
    agent_answers_data = load_json(answers_filepath)
    if agent_answers_data is None:
        return
    # The answers file should contain a JSON list; also accept an object that wraps
    # the list under an 'answers' key.
    if not isinstance(agent_answers_data, list):
        if isinstance(agent_answers_data, dict) and isinstance(agent_answers_data.get('answers'), list):
            agent_answers_data = agent_answers_data['answers']
            print("Note: Loaded answers from the 'answers' key in the JSON object.")
        else:
            print(f"Error: Agent answers file ({args.answers_file}) should contain a JSON list.")
            return

    level_str = f"Level {args.level}" if args.level else "All Levels"
    print(f"\nEvaluating answers for: {level_str}")

    accuracy, correct_count, total_evaluated, incorrect_details = evaluate_answers(
        questions_data, agent_answers_data, args.level
    )

    if total_evaluated == 0:
        print("No answers found for the specified criteria.")
    else:
        print("\n--- Evaluation Results ---")
        print(f"Level Filter: {level_str}")
        print(f"Total Questions Evaluated: {total_evaluated}")
        print(f"Correct Answers: {correct_count}")
        print(f"Accuracy: {accuracy:.2f}%")

        if args.verbose and incorrect_details:
            print("\n--- Incorrect Answers ---")
            for task_id, expected, got in incorrect_details:
                print(f"  Task ID: {task_id}")
                print(f"  Expected: {expected}")
                print(f"  Got: {got}")
                print("------------------------")


if __name__ == "__main__":
    main()
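
# Example invocation (the script name and answers path below are illustrative; adjust
# them to this repository's actual layout):
#
#   python evaluate_gaia_local.py --answers_file agent_answers.json --level 1 --verbose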