File size: 8,295 Bytes
f9a7c9b a5c9e62 f9a7c9b a5c9e62 f9a7c9b a5c9e62 f9a7c9b a5c9e62 f9a7c9b a5c9e62 f9a7c9b a5c9e62 f9a7c9b a5c9e62 f9a7c9b a5c9e62 f9a7c9b a5c9e62 f9a7c9b a5c9e62 f9a7c9b a5c9e62 f9a7c9b a5c9e62 f9a7c9b a5c9e62 f9a7c9b a5c9e62 f9a7c9b a5c9e62 f9a7c9b a5c9e62 f9a7c9b a5c9e62 f9a7c9b a5c9e62 f9a7c9b a5c9e62 f9a7c9b a5c9e62 f9a7c9b a5c9e62 f9a7c9b a5c9e62 f9a7c9b a5c9e62 f9a7c9b a5c9e62 f9a7c9b a5c9e62 f9a7c9b a5c9e62 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 |
#!/usr/bin/env python3
"""
Test script for validating agent performance on GAIA questions.
Runs a predefined question by default; a random-question test is also available. Nothing is submitted.
"""
import time
from utils import fetch_random_question, format_gaia_answer
from agent import smart_agent
def test_predefined_gaia_question():
    """Run the agent on a fixed GAIA-style question and sanity-check its answer.

    The 1928 Summer Olympics question is passed to ``smart_agent`` and the
    answer, timing and a 400-character preview of the reasoning trace are
    printed. Warnings (not failures) are printed when the answer is not 2-3
    uppercase letters, when the trace never mentions "search", or when the
    answer is longer than 5 characters.

    Returns:
        bool: True when the stripped answer is non-empty and at most 5
        characters long. False when the answer is empty/None, too long, or
        when ``smart_agent`` raises.
    """
    import re  # only the IOC-format check below needs it

    # NOTE(review): the "β"/"π" prefixes in the messages look like mis-decoded
    # emoji; they are kept as found -- confirm the file's encoding.
    print("π§ͺ Testing predefined GAIA question (1928 Olympics)")
    print("=" * 60)

    # Predefined question that requires web search
    question = (
        "What country had the least number of athletes at the 1928 Summer "
        "Olympics? If there's a tie for a number of athletes, return the first "
        "in alphabetical order. Give the IOC country code as your answer."
    )
    task_id = "predefined_test"
    print(f"β Question: {question}")
    print()

    # Run the agent; any failure inside it ends the test with False
    print("π€ Running smart agent on the predefined question...")
    try:
        # perf_counter is monotonic, so the duration cannot go negative if the
        # system clock is adjusted while the agent is running
        start_time = time.perf_counter()
        answer, reasoning_trace = smart_agent(question, task_id)
        processing_time = time.perf_counter() - start_time
    except Exception as e:
        print(f"β Error running agent: {e}")
        return False
    print(f"β Agent completed in {processing_time:.2f} seconds")
    print()

    # The trace is only displayed and scanned, so a missing one counts as empty
    trace_text = reasoning_trace or ""

    # Display results
    print("π AGENT RESULTS")
    print("-" * 40)
    print(f"π― Formatted Answer: '{answer}'")
    print(f"π Reasoning Length: {len(trace_text)} characters")
    print(f"β±οΈ Processing Time: {processing_time:.2f}s")
    print()

    # Show reasoning trace preview
    print("π§ REASONING TRACE PREVIEW")
    print("-" * 40)
    if len(trace_text) > 400:
        print(trace_text[:400] + "...")
    else:
        print(trace_text)
    print()

    # Validate answer format for GAIA
    print("β GAIA FORMAT VALIDATION")
    print("-" * 40)

    # An empty or None answer is the only hard failure in this section
    answer_text = answer.strip() if answer else ""
    if not answer_text:
        print("β Answer is empty or None")
        return False
    print("β Answer is not empty")

    # Check if answer looks like IOC country code (2-3 uppercase letters)
    if re.fullmatch(r"[A-Z]{2,3}", answer_text):
        print(f"β Answer '{answer}' matches IOC country code format")
    else:
        print(f"β οΈ Answer '{answer}' may not be in correct IOC format (should be 2-3 uppercase letters)")

    # Any mention of "search" (which includes "web_search") counts as evidence
    if "search" in trace_text.lower():
        print("β Agent appears to have used web search")
    else:
        print("β οΈ No clear evidence of web search usage")

    # Check answer length (should be short for country code)
    is_short = len(answer_text) <= 5
    if is_short:
        print("β Answer length is appropriate for country code")
    else:
        print("β οΈ Answer seems too long for a country code")
    print()

    # Final validation: non-emptiness was established above, so only the
    # length decides the outcome
    print("π FINAL VALIDATION")
    print("-" * 40)
    if is_short:
        print("β PREDEFINED TEST PASSED - Answer format suitable for GAIA")
        print(f"π― Agent produced: '{answer}' for 1928 Olympics question")
        return True
    print("β PREDEFINED TEST FAILED - Answer format needs improvement")
    return False
def test_random_gaia_question():
    """Fetch one random GAIA question, run the agent on it and check the result.

    The question comes from ``fetch_random_question`` and is answered by
    ``smart_agent``. The answer, timing, a 300-character preview of the
    reasoning trace and a preview of the submission entry are printed; the
    entry is built locally and nothing is sent anywhere from this function.

    Returns:
        bool: True when a question with text was fetched, the agent ran
        without raising, the answer is non-blank and ``task_id`` is non-empty.
        False otherwise.
    """
    # NOTE(review): the "β"/"π" prefixes in the messages look like mis-decoded
    # emoji; they are kept as found -- confirm the file's encoding.
    print("π§ GAIA Random Question Test")
    print("=" * 60)

    # Step 1: Fetch a random question
    print("π‘ Fetching random question from GAIA API...")
    try:
        question_data = fetch_random_question()
        if not question_data:
            print("β Failed to fetch random question")
            return False
        task_id = question_data.get("task_id", "unknown")
        question_text = question_data.get("question", "")
    except Exception as e:
        print(f"β Error fetching question: {e}")
        return False
    if not question_text:
        print("β No question text in response")
        return False
    print("β Successfully fetched question")
    print(f"π Task ID: {task_id}")
    print(f"β Question: {question_text}")
    print()

    # Step 2: Run the agent; any failure inside it ends the test with False
    print("π€ Running smart agent on the question...")
    try:
        # perf_counter is monotonic, so the duration cannot go negative if the
        # system clock is adjusted while the agent is running
        start_time = time.perf_counter()
        answer, reasoning_trace = smart_agent(question_text, task_id)
        processing_time = time.perf_counter() - start_time
    except Exception as e:
        print(f"β Error running agent: {e}")
        return False
    print(f"β Agent completed in {processing_time:.2f} seconds")
    print()

    # The trace is optional (see step 6), so a missing one is shown as empty
    # instead of crashing the length/preview code
    trace_text = reasoning_trace or ""

    # Step 3: Display results
    print("π AGENT RESULTS")
    print("-" * 40)
    print(f"π― Formatted Answer: '{answer}'")
    print(f"π Reasoning Length: {len(trace_text)} characters")
    print(f"β±οΈ Processing Time: {processing_time:.2f}s")
    print()

    # Step 4: Show reasoning trace preview
    print("π§ REASONING TRACE PREVIEW")
    print("-" * 40)
    if len(trace_text) > 300:
        print(trace_text[:300] + "...")
    else:
        print(trace_text)
    print()

    # Step 5: Validate answer format
    print("β ANSWER VALIDATION")
    print("-" * 40)

    # An empty or None answer is the only hard failure in this step
    if not (answer and answer.strip()):
        print("β Answer is empty or None")
        return False
    print("β Answer is not empty")

    # Check if answer contains error messages
    answer_upper = answer.upper()
    if "ERROR" in answer_upper or "FAILED" in answer_upper:
        print("β οΈ Answer contains error message")
    else:
        print("β Answer appears to be valid (no error messages)")

    # Check answer length (reasonable bounds)
    if len(answer) > 1000:
        print("β οΈ Answer is very long (>1000 chars) - might need review")
    else:
        print("β Answer length is reasonable")
    print()

    # Step 6: Show submission format
    print("π‘ SUBMISSION FORMAT PREVIEW")
    print("-" * 40)
    submission_entry = {
        "task_id": task_id,
        "model_answer": answer,
        "reasoning_trace": reasoning_trace,
    }

    # Validate required fields
    all_valid = True
    for field in ("task_id", "model_answer"):
        value = submission_entry.get(field)
        if value:
            # Convert once so both the slice and the length test work for
            # non-string values such as a numeric task id
            shown = str(value)
            suffix = "..." if len(shown) > 50 else ""
            print(f"β {field}: '{shown[:50]}{suffix}'")
        else:
            print(f"β Missing or empty {field}")
            all_valid = False

    # Check optional fields
    if submission_entry["reasoning_trace"]:
        print(f"β reasoning_trace: Present ({len(submission_entry['reasoning_trace'])} chars)")
    else:
        print("βΉοΈ reasoning_trace: Not present (optional)")
    print()

    # Step 7: Final validation -- a blank answer already returned in step 5,
    # so only the required-field check decides the outcome
    print("π FINAL VALIDATION")
    print("-" * 40)
    if all_valid:
        print("β ALL CHECKS PASSED - Agent is ready for submission!")
        print("π You can now run the full evaluation with confidence.")
        return True
    print("β SOME CHECKS FAILED - Please review the issues above.")
    return False
if __name__ == "__main__":
    # Script entry point: run only the predefined-question check and report
    # its outcome between two separator lines.
    separator = "=" * 60

    print("π§ͺ Testing agent with predefined GAIA question...")
    print("This test validates web search functionality and answer formatting.")
    print()

    # Test the predefined 1928 Olympics question
    passed = test_predefined_gaia_question()

    print("\n" + separator)
    if not passed:
        print("β οΈ Predefined test revealed issues that need to be addressed.")
    else:
        print("π Predefined test completed successfully! Agent produces well-defined answers.")
        print("π‘ You can also run test_random_gaia_question() for additional testing.")
    print(separator)
|