Final_Assignment_Template

Sleeping

Final_Assignment_Template / test_agent_format.py

Denis Davydov

enhanced web search

a5c9e62 about 2 months ago

3.48 kB

	#!/usr/bin/env python3
	"""
	Test script to verify the agent's answer formatting works correctly.
	"""

	import os
	from agent import smart_agent
	from utils import format_gaia_answer

	def test_answer_formatting():
	    """Run format_gaia_answer over sample raw answers and report each outcome.

	    Nothing is asserted: one status line is printed per case, a mismatch
	    gets an additional warning line, and a separator rule closes the output.
	    """
	    # Each entry pairs a raw answer with the formatted text expected back.
	    cases = (
	        ("I think the answer is 42. FINAL ANSWER: 42", "42"),
	        ("Let me calculate... FINAL ANSWER: 3.14159", "3.14159"),
	        ("After research, FINAL ANSWER: New York", "New York"),
	        ("The result is FINAL ANSWER: apple, banana, cherry", "apple, banana, cherry"),
	        # Thousands separators inside a number are expected to be removed.
	        ("FINAL ANSWER: 1,234", "1234"),
	        # Surrounding double quotes are expected to be removed.
	        ('FINAL ANSWER: "Hello World"', "Hello World"),
	        # Qualifier words such as "approximately" are expected to be removed.
	        ("FINAL ANSWER: approximately 100", "100"),
	        # Fallback: with no FINAL ANSWER marker the text is expected unchanged.
	        ("No clear final answer format here", "No clear final answer format here"),
	    )

	    print("🧪 Testing answer formatting...")
	    for number, (raw, expected) in enumerate(cases, start=1):
	        result = format_gaia_answer(raw)
	        matched = result == expected
	        if matched:
	            status = "✅"
	        else:
	            status = "❌"
	        print(f"{status} Test {number}: '{raw}' -> '{result}' (expected: '{expected}')")
	        if not matched:
	            print(" ⚠️ Mismatch detected!")

	    print("\n" + "=" * 50)

	def test_simple_question():
	    """Ask smart_agent one arithmetic question and print what it returns.

	    Any exception raised while calling the agent or while reporting its
	    output is caught and printed instead of propagating. A separator rule
	    is printed at the end in either case.
	    """
	    print("🤖 Testing agent with a simple question...")

	    question = "What is 2 + 2?"
	    try:
	        # smart_agent is unpacked into exactly two values: answer and reasoning.
	        answer, reasoning = smart_agent(question)
	        print(f"Question: {question}")
	        print(f"Answer: {answer}")
	        trace_length = len(reasoning)
	        print(f"Reasoning length: {trace_length} characters")
	        preview = reasoning[:200]
	        print(f"Raw reasoning preview: {preview}...")

	        # An answer only counts when it is truthy and not whitespace-only.
	        verdict = (
	            "✅ Agent returned a non-empty answer"
	            if answer and answer.strip()
	            else "❌ Agent returned empty answer"
	        )
	        print(verdict)
	    except Exception as exc:
	        # Best-effort smoke test: report the failure and keep the script going.
	        print(f"❌ Error testing agent: {exc}")

	    print("\n" + "=" * 50)

	def test_api_format():
	    """Print a field-by-field check of a hard-coded sample submission.

	    The sample payload is built locally and nothing is sent anywhere.
	    Required fields are reported with their values, optional fields with
	    the length of their string form.
	    """
	    print("📡 Testing API submission format...")

	    # Stand-in for the payload that would be sent to the API.
	    payload = dict(
	        task_id="test_task_1",
	        model_answer="42",
	        reasoning_trace="I calculated 2+2 and got 4, but the question asks for something else...",
	    )

	    print("Required fields check:")
	    for name in ("task_id", "model_answer"):
	        if name not in payload:
	            print(f"❌ Missing required field: {name}")
	            continue
	        print(f"✅ {name}: {payload[name]}")

	    print("Optional fields check:")
	    for name in ("reasoning_trace",):
	        if name not in payload:
	            print(f"ℹ️ Optional field not present: {name}")
	            continue
	        size = len(str(payload[name]))
	        print(f"✅ {name}: Present ({size} chars)")

	if __name__ == "__main__":
	    print("🔧 GAIA Agent Format Testing")
	    print("=" * 50)

	    # Run the checks in a fixed order: answer formatting, then a simple
	    # agent question, then the API submission format.
	    for check in (test_answer_formatting, test_simple_question, test_api_format):
	        check()

	    print("🏁 Testing complete!")