File size: 3,481 Bytes
a5c9e62 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 |
#!/usr/bin/env python3
"""
Test script to verify the agent's answer formatting works correctly.
"""
import os
from agent import smart_agent
from utils import format_gaia_answer
def test_answer_formatting():
    """Run ``format_gaia_answer`` over a table of raw/expected answer pairs.

    Every case is passed through ``format_gaia_answer`` and reported on
    stdout as a pass/fail line; a mismatch gets an extra warning line.
    This is a print-based check: nothing is asserted and nothing is returned,
    so a failing case does not stop the remaining cases from running.
    """
    test_cases = [
        # Test case: (raw_answer, expected_format)
        ("I think the answer is 42. FINAL ANSWER: 42", "42"),
        ("Let me calculate... FINAL ANSWER: 3.14159", "3.14159"),
        ("After research, FINAL ANSWER: New York", "New York"),
        ("The result is FINAL ANSWER: apple, banana, cherry", "apple, banana, cherry"),
        ("FINAL ANSWER: 1,234", "1234"),  # Should remove commas from numbers
        ("FINAL ANSWER: \"Hello World\"", "Hello World"),  # Should remove quotes
        ("FINAL ANSWER: approximately 100", "100"),  # Should remove qualifiers
        ("No clear final answer format here", "No clear final answer format here"),  # Fallback
    ]

    print("🧪 Testing answer formatting...")
    for i, (raw, expected) in enumerate(test_cases, 1):
        result = format_gaia_answer(raw)
        matched = result == expected
        # The status marker must be a single-line literal; the pass emoji had
        # been corrupted into a string broken across two source lines.
        status = "✅" if matched else "❌"
        print(f"{status} Test {i}: '{raw}' -> '{result}' (expected: '{expected}')")
        if not matched:
            print(" ⚠️ Mismatch detected!")

    print("\n" + "="*50)
def test_simple_question():
    """Ask the agent one trivial arithmetic question and print what comes back.

    ``smart_agent(question)`` is unpacked into ``(answer, reasoning)``.  The
    answer, the reasoning length, and the first 200 characters of the
    reasoning are printed, followed by a check that the answer is non-empty
    after stripping whitespace.  Any exception raised along the way is caught
    and reported on stdout instead of propagating, so the caller can carry on
    with the next check.
    """
    print("🤖 Testing agent with a simple question...")
    question = "What is 2 + 2?"

    try:
        answer, reasoning = smart_agent(question)
        print(f"Question: {question}")
        print(f"Answer: {answer}")
        print(f"Reasoning length: {len(reasoning)} characters")
        print(f"Raw reasoning preview: {reasoning[:200]}...")

        # Check if answer follows expected format
        if answer and answer.strip():
            print("✅ Agent returned a non-empty answer")
        else:
            print("❌ Agent returned empty answer")
    except Exception as e:
        # Deliberately broad: this is a smoke test, and any failure in the
        # agent call or in printing its result should be shown, not raised.
        print(f"❌ Error testing agent: {e}")

    print("\n" + "="*50)
def test_api_format():
    """Print a field-by-field check of a sample submission payload.

    A hard-coded sample dict is checked against a list of required keys and a
    list of optional keys.  Each required key is printed with its value (or
    reported missing); each optional key is printed with the length of its
    stringified value (or reported as not present).  Nothing is sent anywhere
    and nothing is returned.
    """
    print("📡 Testing API submission format...")

    # Simulate what would be sent to the API
    sample_submission = {
        "task_id": "test_task_1",
        "model_answer": "42",
        "reasoning_trace": "I calculated 2+2 and got 4, but the question asks for something else..."
    }

    required_fields = ["task_id", "model_answer"]
    optional_fields = ["reasoning_trace"]

    print("Required fields check:")
    for field in required_fields:
        if field in sample_submission:
            print(f"✅ {field}: {sample_submission[field]}")
        else:
            print(f"❌ Missing required field: {field}")

    print("Optional fields check:")
    for field in optional_fields:
        if field in sample_submission:
            print(f"✅ {field}: Present ({len(str(sample_submission[field]))} chars)")
        else:
            print(f"ℹ️ Optional field not present: {field}")
if __name__ == "__main__":
    # Script entry point: run the three checks in order, framed by a banner
    # and a completion line.
    print("🔧 GAIA Agent Format Testing")
    print("="*50)

    # Test 1: Answer formatting
    test_answer_formatting()

    # Test 2: Simple agent question
    test_simple_question()

    # Test 3: API format
    test_api_format()

    print("🎉 Testing complete!")
|