Yago Bolivar committed
Commit: b121170
Parent(s): 4d7d7f8

feat: add evaluation and submission utilities for GAIA project
utilities/{compare_questions.py → compare_question_set.py}
RENAMED
File without changes
utilities/evaluate_local.py
ADDED
@@ -0,0 +1,140 @@
```python
import json
import argparse
import os

# TEST WITH
# python3 utilities/evaluate_local.py --answers_file ./question_set/agent_answers.json

def load_json(filepath):
    """Loads JSON data from a file."""
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
        return None
    except json.JSONDecodeError:
        print(f"Error: Could not decode JSON from {filepath}")
        return None

def evaluate_answers(questions_data, agent_answers_data, level_filter=None):
    """
    Evaluates agent answers against ground truth.

    Args:
        questions_data (dict): Dictionary mapping task_id to question details including 'Final Answer'.
        agent_answers_data (list): List of dictionaries with 'task_id' and 'submitted_answer'.
        level_filter (int, optional): Filter evaluation to only this GAIA level. Defaults to None.

    Returns:
        tuple: (accuracy, correct_count, total_evaluated, incorrect_details)
               incorrect_details is a list of tuples: (task_id, expected, got)
    """
    correct_count = 0
    total_evaluated = 0
    incorrect_details = []
    agent_answers_map = {item['task_id']: item['submitted_answer'] for item in agent_answers_data}

    for task_id, question_info in questions_data.items():
        # Apply level filter if specified
        if level_filter is not None and question_info.get('Level') != level_filter:
            continue

        if task_id in agent_answers_map:
            total_evaluated += 1
            expected_answer = question_info.get('Final Answer')
            submitted_answer = agent_answers_map[task_id]

            # GAIA uses exact match
            if str(submitted_answer) == str(expected_answer):
                correct_count += 1
            else:
                incorrect_details.append((task_id, expected_answer, submitted_answer))
        # else:
        #     print(f"Warning: No submitted answer found for task_id {task_id}")  # Optional warning

    accuracy = (correct_count / total_evaluated) * 100 if total_evaluated > 0 else 0
    return accuracy, correct_count, total_evaluated, incorrect_details

def main():
    parser = argparse.ArgumentParser(description="Evaluate agent answers locally against GAIA ground truth.")
    parser.add_argument(
        "--questions_file",
        type=str,
        default="../question_set/new_gaia_questions.json",  # Adjusted default path
        help="Path to the JSON file containing GAIA questions and answers."
    )
    parser.add_argument(
        "--answers_file",
        type=str,
        required=True,
        help="Path to the JSON file containing the agent's submitted answers."
    )
    parser.add_argument(
        "--level",
        type=int,
        choices=[1, 2, 3],
        default=None,  # Default is None, meaning evaluate all levels
        help="Specify the GAIA level (1, 2, or 3) to evaluate. Evaluates all levels if not specified."
    )
    parser.add_argument(
        "--verbose",
        action='store_true',
        help="Print details of incorrect answers."
    )

    args = parser.parse_args()

    # Construct absolute paths relative to the script location
    script_dir = os.path.dirname(__file__)
    questions_filepath = os.path.abspath(os.path.join(script_dir, args.questions_file))
    answers_filepath = os.path.abspath(os.path.join(script_dir, '..', args.answers_file))  # Assume answers file is in root relative to script in utilities

    print(f"Loading questions from: {questions_filepath}")
    questions_data = load_json(questions_filepath)
    if questions_data is None:
        return

    print(f"Loading agent answers from: {answers_filepath}")
    agent_answers_data = load_json(answers_filepath)
    if agent_answers_data is None:
        return

    # Ensure agent_answers_data is a list
    if not isinstance(agent_answers_data, list):
        # Attempt to recover if it's a dict containing a list (common mistake)
        if isinstance(agent_answers_data, dict) and 'answers' in agent_answers_data and isinstance(agent_answers_data['answers'], list):
            agent_answers_data = agent_answers_data['answers']
            print("Note: Loaded answers from the 'answers' key in the JSON object.")
        else:
            print(f"Error: Agent answers file ({args.answers_file}) should contain a JSON list.")
            return

    level_str = f"Level {args.level}" if args.level else "All Levels"
    print(f"\nEvaluating answers for: {level_str}")

    accuracy, correct_count, total_evaluated, incorrect_details = evaluate_answers(
        questions_data, agent_answers_data, args.level
    )

    if total_evaluated == 0:
        print("No answers found for the specified criteria.")
    else:
        print("\n--- Evaluation Results ---")
        print(f"Level Filter: {level_str}")
        print(f"Total Questions Evaluated: {total_evaluated}")
        print(f"Correct Answers: {correct_count}")
        print(f"Accuracy: {accuracy:.2f}%")

        if args.verbose and incorrect_details:
            print("\n--- Incorrect Answers ---")
            for task_id, expected, got in incorrect_details:
                print(f"  Task ID: {task_id}")
                print(f"  Expected: {expected}")
                print(f"  Got: {got}")
                print("------------------------")

if __name__ == "__main__":
    main()
```
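For reference, here is a minimal sketch of the two input shapes `evaluate_local.py` expects, inferred from the code above: `--questions_file` is a JSON object mapping each `task_id` to question details (including `Level` and `Final Answer`), and `--answers_file` is a JSON list of records with `task_id` and `submitted_answer` keys. The task IDs and answers below are placeholders, not real GAIA data.

```python
import json

# questions_file: dict mapping task_id -> question details.
# Keys 'Level' and 'Final Answer' match what evaluate_answers() reads;
# the task IDs and answers here are made up for illustration.
questions = {
    "task-001": {"Level": 1, "Final Answer": "Paris"},
    "task-002": {"Level": 2, "Final Answer": "42"},
}

# answers_file: list of {"task_id", "submitted_answer"} records.
answers = [
    {"task_id": "task-001", "submitted_answer": "Paris"},
    {"task_id": "task-002", "submitted_answer": "41"},
]

with open("new_gaia_questions.json", "w", encoding="utf-8") as f:
    json.dump(questions, f, indent=2)
with open("agent_answers.json", "w", encoding="utf-8") as f:
    json.dump(answers, f, indent=2)
```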
utilities/evaluate_local_commands.md
ADDED
@@ -0,0 +1,17 @@
**Run the Evaluation Script:** Open your terminal, navigate to the `utilities` directory, and run the script:

* **Evaluate all levels:**

  ```bash
  cd /Users/yagoairm2/Desktop/agents/final\ project/HF_Agents_Final_Project/utilities
  python evaluate_local.py --answers_file ../agent_answers.json
  ```

* **Evaluate only Level 1:**

  ```bash
  python evaluate_local.py --answers_file ../agent_answers.json --level 1
  ```

* **Evaluate Level 1 and show incorrect answers:**

  ```bash
  python evaluate_local.py --answers_file ../agent_answers.json --level 1 --verbose
  ```

This script calculates and prints accuracy based on the exact-match criterion GAIA uses, without submitting anything to the official leaderboard.
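Assuming the placeholder files from the sketch above (two questions, one matching answer), a run without a level filter would print output along these lines; the paths and counts here are illustrative, not captured from a real run:

```text
Loading questions from: /path/to/question_set/new_gaia_questions.json
Loading agent answers from: /path/to/agent_answers.json

Evaluating answers for: All Levels

--- Evaluation Results ---
Level Filter: All Levels
Total Questions Evaluated: 2
Correct Answers: 1
Accuracy: 50.00%
```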
utilities/{random_question_answer.py → random_question_submit.py}
RENAMED
File without changes