import json
import argparse
import os

# TEST WITH
# python3 utilities/evaluate_local.py --answers_file ./question_set/agent_answers.json
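# python3 utilities/evaluate_local.py --answers_file ./question_set/agent_answers.json --level 1 --verbose
#
# Expected shape of the answers file, inferred from the keys read in
# evaluate_answers() below (values shown are placeholders):
# [
#   {"task_id": "<GAIA task id>", "submitted_answer": "<agent's final answer>"},
#   ...
# ]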

def load_json(filepath):
    """Loads JSON data from a file."""
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
        return None
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {filepath}")
        return None

def evaluate_answers(questions_data, agent_answers_data, level_filter=None):
    """
    Evaluates agent answers against ground truth.

    Args:
        questions_data (dict): Dictionary mapping task_id to question details including 'Final Answer'.
        agent_answers_data (list): List of dictionaries with 'task_id' and 'submitted_answer'.
        level_filter (int, optional): Filter evaluation to only this GAIA level. Defaults to None.

    Returns:
        tuple: (accuracy, correct_count, total_evaluated, incorrect_details)
               incorrect_details is a list of tuples: (task_id, expected, got)
    """
    correct_count = 0
    total_evaluated = 0
    incorrect_details = []
    agent_answers_map = {item['task_id']: item['submitted_answer'] for item in agent_answers_data}

    for task_id, question_info in questions_data.items():
        # Apply level filter if specified
        if level_filter is not None and question_info.get('Level') != level_filter:
            continue

        if task_id in agent_answers_map:
            total_evaluated += 1
            expected_answer = question_info.get('Final Answer')
            submitted_answer = agent_answers_map[task_id]

            # GAIA uses exact match
            if str(submitted_answer) == str(expected_answer):
                correct_count += 1
            else:
                incorrect_details.append((task_id, expected_answer, submitted_answer))
        # else:
        #     print(f"Warning: No submitted answer found for task_id {task_id}") # Optional warning

    accuracy = (correct_count / total_evaluated) * 100 if total_evaluated > 0 else 0
    return accuracy, correct_count, total_evaluated, incorrect_details
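
# Illustrative call with made-up data (not taken from the real question set):
#   questions = {"task-1": {"Level": 1, "Final Answer": "42"}}
#   answers = [{"task_id": "task-1", "submitted_answer": "42"}]
#   evaluate_answers(questions, answers)  # -> (100.0, 1, 1, [])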

def main():
    parser = argparse.ArgumentParser(description="Evaluate agent answers locally against GAIA ground truth.")
    parser.add_argument(
        "--questions_file",
        type=str,
        default="../question_set/new_gaia_questions.json", # Adjusted default path
        help="Path to the JSON file containing GAIA questions and answers."
    )
    parser.add_argument(
        "--answers_file",
        type=str,
        required=True,
        help="Path to the JSON file containing the agent's submitted answers."
    )
    parser.add_argument(
        "--level",
        type=int,
        choices=[1, 2, 3],
        default=None, # Default is None, meaning evaluate all levels
        help="Specify the GAIA level (1, 2, or 3) to evaluate. Evaluates all levels if not specified."
    )
    parser.add_argument(
        "--verbose",
        action='store_true', # Add verbose flag
        help="Print details of incorrect answers."
    )

    args = parser.parse_args()

    # Construct absolute paths relative to the script location
    script_dir = os.path.dirname(__file__)
    questions_filepath = os.path.abspath(os.path.join(script_dir, args.questions_file))
    answers_filepath = os.path.abspath(os.path.join(script_dir, '..', args.answers_file))  # Answers path is resolved relative to the repo root, one level above this utilities/ script

    print(f"Loading questions from: {questions_filepath}")
    questions_data = load_json(questions_filepath)
    if questions_data is None:
        return

    print(f"Loading agent answers from: {answers_filepath}")
    agent_answers_data = load_json(answers_filepath)
    if agent_answers_data is None:
        return

    # Ensure agent_answers_data is a list; recover if the list is wrapped in an object
    if not isinstance(agent_answers_data, list):
        # Attempt to recover if it's a dict containing a list under 'answers' (common mistake)
        if isinstance(agent_answers_data, dict) and isinstance(agent_answers_data.get('answers'), list):
            agent_answers_data = agent_answers_data['answers']
            print("Note: Loaded answers from the 'answers' key in the JSON object.")
        else:
            print(f"Error: Agent answers file ({args.answers_file}) should contain a JSON list.")
            return

    level_str = f"Level {args.level}" if args.level else "All Levels"
    print(f"\nEvaluating answers for: {level_str}")

    accuracy, correct_count, total_evaluated, incorrect_details = evaluate_answers(
        questions_data, agent_answers_data, args.level
    )

    if total_evaluated == 0:
        print("No answers found for the specified criteria.")
    else:
        print("\n--- Evaluation Results ---")
        print(f"Level Filter: {level_str}")
        print(f"Total Questions Evaluated: {total_evaluated}")
        print(f"Correct Answers: {correct_count}")
        print(f"Accuracy: {accuracy:.2f}%")

        if args.verbose and incorrect_details:
            print("\n--- Incorrect Answers ---")
            for task_id, expected, got in incorrect_details:
                print(f"  Task ID: {task_id}")
                print(f"    Expected: {expected}")
                print(f"    Got:      {got}")
            print("------------------------")

if __name__ == "__main__":
    main()