#!/usr/bin/env python3
"""
Test script to verify the agent's answer formatting works correctly.

Every check in this file reports its outcome on stdout only; nothing is
asserted, raised, or returned.
"""

import os

from agent import smart_agent
from utils import format_gaia_answer


def test_answer_formatting():
    """Test the answer formatting function with various inputs.

    Feeds each raw answer in a fixed table to ``format_gaia_answer`` and
    prints one pass/fail line per case, plus a warning line whenever the
    result differs from the expected string.
    """
    test_cases = [
        # Test case: (raw_answer, expected_format)
        ("I think the answer is 42. FINAL ANSWER: 42", "42"),
        ("Let me calculate... FINAL ANSWER: 3.14159", "3.14159"),
        ("After research, FINAL ANSWER: New York", "New York"),
        ("The result is FINAL ANSWER: apple, banana, cherry", "apple, banana, cherry"),
        ("FINAL ANSWER: 1,234", "1234"),  # Should remove commas from numbers
        ("FINAL ANSWER: \"Hello World\"", "Hello World"),  # Should remove quotes
        ("FINAL ANSWER: approximately 100", "100"),  # Should remove qualifiers
        ("No clear final answer format here", "No clear final answer format here"),  # Fallback
    ]

    print("🧪 Testing answer formatting...")

    for i, (raw, expected) in enumerate(test_cases, 1):
        result = format_gaia_answer(raw)
        status = "✅" if result == expected else "❌"
        print(f"{status} Test {i}: '{raw}' -> '{result}' (expected: '{expected}')")
        if result != expected:
            print(" ⚠️ Mismatch detected!")

    print("\n" + "=" * 50)


def test_simple_question():
    """Test the agent with a simple question.

    Calls ``smart_agent`` with a fixed arithmetic question, unpacks its
    ``(answer, reasoning)`` result and prints a short summary. Any exception
    raised while calling the agent or summarising its output is caught and
    printed, so the remaining checks in the script still run.
    """
    print("🤖 Testing agent with a simple question...")

    question = "What is 2 + 2?"

    try:
        answer, reasoning = smart_agent(question)
        print(f"Question: {question}")
        print(f"Answer: {answer}")
        print(f"Reasoning length: {len(reasoning)} characters")
        print(f"Raw reasoning preview: {reasoning[:200]}...")

        # Check if answer follows expected format
        if answer and answer.strip():
            print("✅ Agent returned a non-empty answer")
        else:
            print("❌ Agent returned empty answer")

    except Exception as e:
        # Broad on purpose: this is the script's top-level reporting boundary.
        print(f"❌ Error testing agent: {e}")

    print("\n" + "=" * 50)


def test_api_format():
    """Test that our submission format matches API expectations.

    Builds a hard-coded sample submission dict and prints, for each required
    and optional field name, whether that key is present in the sample.
    """
    print("📡 Testing API submission format...")

    # Simulate what would be sent to the API
    sample_submission = {
        "task_id": "test_task_1",
        "model_answer": "42",
        "reasoning_trace": "I calculated 2+2 and got 4, but the question asks for something else...",
    }

    required_fields = ["task_id", "model_answer"]
    optional_fields = ["reasoning_trace"]

    print("Required fields check:")
    for field in required_fields:
        if field in sample_submission:
            print(f"✅ {field}: {sample_submission[field]}")
        else:
            print(f"❌ Missing required field: {field}")

    print("Optional fields check:")
    for field in optional_fields:
        if field in sample_submission:
            print(f"✅ {field}: Present ({len(str(sample_submission[field]))} chars)")
        else:
            print(f"ℹ️ Optional field not present: {field}")


def main():
    """Run the three format checks in order, printing a banner around them."""
    print("🔧 GAIA Agent Format Testing")
    print("=" * 50)

    # Test 1: Answer formatting
    test_answer_formatting()

    # Test 2: Simple agent question
    test_simple_question()

    # Test 3: API format
    test_api_format()

    print("🏁 Testing complete!")


if __name__ == "__main__":
    main()