|
|
|
""" |
|
Test script to verify the agent's answer formatting works correctly. |
|
""" |
|
|
|
import os |
|
from agent import smart_agent |
|
from utils import format_gaia_answer |
|
|
|
def test_answer_formatting():
    """Run ``format_gaia_answer`` over sample responses and print a report.

    Each case pairs a raw model response with the answer string expected
    back from the formatter. One status line is printed per case, plus a
    warning line when the result differs from the expectation.

    Nothing is raised or returned: a failing case is visible only in the
    printed output, so read the report rather than the exit status.
    """
    # (raw model response, expected formatted answer)
    test_cases = [
        ("I think the answer is 42. FINAL ANSWER: 42", "42"),
        ("Let me calculate... FINAL ANSWER: 3.14159", "3.14159"),
        ("After research, FINAL ANSWER: New York", "New York"),
        ("The result is FINAL ANSWER: apple, banana, cherry", "apple, banana, cherry"),
        # Expected: the thousands separator is dropped from a number.
        ("FINAL ANSWER: 1,234", "1234"),
        # Expected: surrounding double quotes are removed.
        ("FINAL ANSWER: \"Hello World\"", "Hello World"),
        # Expected: the hedging word is removed, leaving only the value.
        ("FINAL ANSWER: approximately 100", "100"),
        # Expected: text without the marker comes back unchanged.
        ("No clear final answer format here", "No clear final answer format here"),
    ]

    print("🧪 Testing answer formatting...")
    for i, (raw, expected) in enumerate(test_cases, 1):
        result = format_gaia_answer(raw)
        passed = result == expected
        status = "✅" if passed else "❌"
        print(f"{status} Test {i}: '{raw}' -> '{result}' (expected: '{expected}')")
        if not passed:
            print(" ⚠️ Mismatch detected!")

    # Separator between this report and the next section of the script.
    print("\n" + "=" * 50)
|
|
|
def test_simple_question():
    """Ask ``smart_agent`` one trivial question and print what comes back.

    The agent call is unpacked as an ``(answer, reasoning)`` pair. The
    answer, the length of the reasoning and its first 200 characters are
    printed, followed by a check that the answer is not blank.

    Any exception raised while calling the agent or printing its output is
    caught and reported, so the rest of the script still runs. Nothing is
    returned.
    """
    print("🤖 Testing agent with a simple question...")

    question = "What is 2 + 2?"
    try:
        answer, reasoning = smart_agent(question)
        print(f"Question: {question}")
        print(f"Answer: {answer}")
        print(f"Reasoning length: {len(reasoning)} characters")
        print(f"Raw reasoning preview: {reasoning[:200]}...")

        # A whitespace-only answer counts as empty.
        if answer and answer.strip():
            print("✅ Agent returned a non-empty answer")
        else:
            print("❌ Agent returned empty answer")
    except Exception as e:
        # Deliberately broad: this is a best-effort smoke check, and any
        # agent failure should be reported rather than abort the script.
        print(f"❌ Error testing agent: {e}")

    # Separator between this report and the next section of the script.
    print("\n" + "=" * 50)
|
|
|
def test_api_format():
    """Print a field-by-field check of a sample submission payload.

    A hard-coded sample submission dict is checked against a list of
    required keys (``task_id``, ``model_answer``) and a list of optional
    keys (``reasoning_trace``). One line is printed per key saying whether
    it is present. Nothing is raised or returned.
    """
    print("📡 Testing API submission format...")

    # Fixed example payload; no request is sent anywhere.
    sample_submission = {
        "task_id": "test_task_1",
        "model_answer": "42",
        "reasoning_trace": "I calculated 2+2 and got 4, but the question asks for something else...",
    }

    required_fields = ["task_id", "model_answer"]
    optional_fields = ["reasoning_trace"]

    print("Required fields check:")
    for field in required_fields:
        if field in sample_submission:
            print(f"✅ {field}: {sample_submission[field]}")
        else:
            print(f"❌ Missing required field: {field}")

    print("Optional fields check:")
    for field in optional_fields:
        if field in sample_submission:
            # Report only the size; the trace text itself can be long.
            print(f"✅ {field}: Present ({len(str(sample_submission[field]))} chars)")
        else:
            print(f"ℹ️ Optional field not present: {field}")
|
|
|
if __name__ == "__main__":
    # Script entry point: print a banner, run the three checks in order,
    # then print a closing line.
    print("🔧 GAIA Agent Format Testing")
    print("=" * 50)

    # Formatter cases first.
    test_answer_formatting()

    # Then one live call to the agent.
    test_simple_question()

    # Finally the field check on the sample submission dict.
    test_api_format()

    print("🎉 Testing complete!")
|
|