File size: 3,481 Bytes
a5c9e62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/usr/bin/env python3
"""
Test script to verify the agent's answer formatting works correctly.
"""

import os
from agent import smart_agent
from utils import format_gaia_answer

def test_answer_formatting():
    """Check ``format_gaia_answer`` against a table of raw/expected pairs.

    Every raw response in the table is passed through ``format_gaia_answer``
    and the result is compared with the expected string. One status line is
    printed per case, plus a warning line for each mismatch, followed by a
    separator line. Mismatches are only reported on stdout; nothing is raised
    or returned.
    """

    test_cases = [
        # Test case: (raw_answer, expected_format)
        ("I think the answer is 42. FINAL ANSWER: 42", "42"),
        ("Let me calculate... FINAL ANSWER: 3.14159", "3.14159"),
        ("After research, FINAL ANSWER: New York", "New York"),
        ("The result is FINAL ANSWER: apple, banana, cherry", "apple, banana, cherry"),
        ("FINAL ANSWER: 1,234", "1234"),  # Should remove commas from numbers
        ("FINAL ANSWER: \"Hello World\"", "Hello World"),  # Should remove quotes
        ("FINAL ANSWER: approximately 100", "100"),  # Should remove qualifiers
        ("No clear final answer format here", "No clear final answer format here"),  # Fallback
    ]

    # The emoji literals below were previously mojibake (UTF-8 bytes decoded
    # with a single-byte code page); they are restored to the real symbols so
    # they match the intact failure marker used in the same expression.
    print("🧪 Testing answer formatting...")
    for i, (raw, expected) in enumerate(test_cases, 1):
        result = format_gaia_answer(raw)
        # Evaluate the comparison once and reuse it for both the status icon
        # and the mismatch warning.
        passed = result == expected
        status = "✅" if passed else "❌"
        print(f"{status} Test {i}: '{raw}' -> '{result}' (expected: '{expected}')")
        if not passed:
            print("   ⚠️  Mismatch detected!")

    print("\n" + "="*50)

def test_simple_question():
    """Run ``smart_agent`` on one trivial question and report the outcome.

    The agent is expected to return an ``(answer, reasoning)`` pair. The
    answer, the length of the reasoning and a 200-character preview of it are
    printed, followed by a pass/fail line depending on whether the answer is
    non-empty after stripping whitespace. Any exception raised while calling
    the agent or inspecting its result is caught and printed rather than
    propagated, so the remaining checks in the script can still run.
    """
    # Emoji literals were previously mojibake; restored to the real symbols.
    print("🤖 Testing agent with a simple question...")

    question = "What is 2 + 2?"
    try:
        answer, reasoning = smart_agent(question)
        print(f"Question: {question}")
        print(f"Answer: {answer}")
        print(f"Reasoning length: {len(reasoning)} characters")
        print(f"Raw reasoning preview: {reasoning[:200]}...")

        # Check if answer follows expected format
        if answer and answer.strip():
            print("✅ Agent returned a non-empty answer")
        else:
            print("❌ Agent returned empty answer")

    except Exception as e:
        # Deliberately broad: this is a best-effort smoke test, so any failure
        # (agent error, malformed return value) is reported instead of
        # aborting the whole script.
        print(f"❌ Error testing agent: {e}")

    print("\n" + "="*50)

def test_api_format():
    """Print a field-presence report for a sample API submission payload.

    A hard-coded sample submission dict is checked against a list of required
    field names and a list of optional field names. For each required field a
    line with its value (or a "missing" line) is printed; for each optional
    field a line with the length of its string form (or a "not present" line)
    is printed. Nothing is sent anywhere and nothing is returned.
    """
    # Emoji literals were previously mojibake; restored to the real symbols so
    # the success marker matches the intact failure/info markers below.
    print("📡 Testing API submission format...")

    # Simulate what would be sent to the API
    sample_submission = {
        "task_id": "test_task_1",
        "model_answer": "42",
        "reasoning_trace": "I calculated 2+2 and got 4, but the question asks for something else..."
    }

    required_fields = ["task_id", "model_answer"]
    optional_fields = ["reasoning_trace"]

    print("Required fields check:")
    for field in required_fields:
        if field in sample_submission:
            print(f"✅ {field}: {sample_submission[field]}")
        else:
            print(f"❌ Missing required field: {field}")

    print("Optional fields check:")
    for field in optional_fields:
        if field in sample_submission:
            print(f"✅ {field}: Present ({len(str(sample_submission[field]))} chars)")
        else:
            print(f"ℹ️  Optional field not present: {field}")

if __name__ == "__main__":
    # Script entry point: print a banner, run the three checks in order
    # (formatting, live agent call, submission payload shape), then print a
    # completion line. The banner emoji was previously mojibake and is
    # restored to the real symbol.
    print("🔧 GAIA Agent Format Testing")
    print("="*50)

    # Test 1: Answer formatting
    test_answer_formatting()

    # Test 2: Simple agent question
    test_simple_question()

    # Test 3: API format
    test_api_format()

    print("🏁 Testing complete!")