Final_Assignment_Template

Sleeping

File size: 8,295 Bytes

f9a7c9b
 
a5c9e62
 
f9a7c9b
 
a5c9e62
 
f9a7c9b
 
a5c9e62
 
f9a7c9b
a5c9e62
 
f9a7c9b
a5c9e62
 
 
f9a7c9b
a5c9e62
f9a7c9b
 
a5c9e62
 
f9a7c9b
a5c9e62
 
 
 
 
 
 
 
f9a7c9b
a5c9e62
 
f9a7c9b
a5c9e62
 
 
 
 
 
f9a7c9b
a5c9e62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f9a7c9b
a5c9e62
 
 
 
 
f9a7c9b
a5c9e62
 
f9a7c9b
 
 
a5c9e62
 
f9a7c9b
a5c9e62
 
f9a7c9b
a5c9e62
 
 
 
 
 
 
 
f9a7c9b
a5c9e62
 
 
 
 
 
 
 
 
 
 
 
 
 
f9a7c9b
 
a5c9e62
 
f9a7c9b
a5c9e62
 
 
 
 
 
f9a7c9b
 
a5c9e62
 
 
 
 
 
f9a7c9b
a5c9e62
 
 
f9a7c9b
a5c9e62
 
 
 
 
f9a7c9b
 
a5c9e62
 
 
 
 
 
 
 
 
 
 
f9a7c9b
a5c9e62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f9a7c9b
 
 
a5c9e62
 
 
 
 
 
 
 
 
 
 
f9a7c9b
 
a5c9e62

#!/usr/bin/env python3
"""
Test script for validating agent performance on a random GAIA question.
Fetches one random question and tests the complete pipeline without submitting.
"""

import time
from utils import fetch_random_question, format_gaia_answer
from agent import smart_agent

def test_predefined_gaia_question():
    """Run the agent on a fixed 1928 Olympics question and check the answer's shape.

    Calls ``smart_agent`` once with a hard-coded question, then prints the
    answer, a preview of the reasoning trace, and a series of format checks.

    Returns:
        bool: ``True`` when the answer is non-empty and at most 5 characters
        long after stripping whitespace. ``False`` when the agent raises, the
        answer is empty/``None``, or the answer is longer than 5 characters.
        The IOC-code pattern check and the web-search check only print
        warnings; they do not influence the returned value.
    """
    import re  # function-local, as in the original block

    print("🧪 Testing predefined GAIA question (1928 Olympics)")
    print("="*60)

    # Predefined question that requires web search
    question = "What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer."
    task_id = "predefined_test"

    print(f"❓ Question: {question}")
    print()

    # Run the agent. Only the call itself is guarded; perf_counter is used
    # because it is monotonic, so the elapsed time cannot be skewed by
    # wall-clock adjustments.
    print("🤖 Running smart agent on the predefined question...")
    try:
        start_time = time.perf_counter()
        answer, reasoning_trace = smart_agent(question, task_id)
        processing_time = time.perf_counter() - start_time
    except Exception as e:
        print(f"❌ Error running agent: {e}")
        return False

    print(f"✅ Agent completed in {processing_time:.2f} seconds")
    print()

    # Normalise once so the checks below cannot raise on None or on a
    # non-string value. For ordinary string results these are the same
    # values the checks always operated on.
    answer_text = str(answer).strip() if answer else ""
    trace_text = str(reasoning_trace) if reasoning_trace else ""

    # Display results
    print("📊 AGENT RESULTS")
    print("-" * 40)
    print(f"🎯 Formatted Answer: '{answer}'")
    print(f"📝 Reasoning Length: {len(trace_text)} characters")
    print(f"⏱️  Processing Time: {processing_time:.2f}s")
    print()

    # Show reasoning trace preview (first 400 characters)
    print("🧠 REASONING TRACE PREVIEW")
    print("-" * 40)
    if len(trace_text) > 400:
        reasoning_preview = trace_text[:400] + "..."
    else:
        reasoning_preview = trace_text
    print(reasoning_preview)
    print()

    # Validate answer format for GAIA
    print("✅ GAIA FORMAT VALIDATION")
    print("-" * 40)

    # An empty answer is the only format check that fails the test outright.
    if answer_text:
        print("✅ Answer is not empty")
    else:
        print("❌ Answer is empty or None")
        return False

    # Check if answer looks like IOC country code (2-3 uppercase letters)
    if re.fullmatch(r'[A-Z]{2,3}', answer_text):
        print(f"✅ Answer '{answer}' matches IOC country code format")
    else:
        print(f"⚠️  Answer '{answer}' may not be in correct IOC format (should be 2-3 uppercase letters)")

    # Check if web search was used. Looking for "search" also covers
    # "web_search", so a single substring test is sufficient.
    if "search" in trace_text.lower():
        print("✅ Agent appears to have used web search")
    else:
        print("⚠️  No clear evidence of web search usage")

    # Check answer length (should be short for country code)
    is_short = len(answer_text) <= 5
    if is_short:
        print("✅ Answer length is appropriate for country code")
    else:
        print("⚠️  Answer seems too long for a country code")

    print()

    # Final validation: the answer is known to be non-empty at this point,
    # so the verdict depends only on its length.
    print("🏁 FINAL VALIDATION")
    print("-" * 40)

    if is_short:
        print("✅ PREDEFINED TEST PASSED - Answer format suitable for GAIA")
        print(f"🎯 Agent produced: '{answer}' for 1928 Olympics question")
        return True
    else:
        print("❌ PREDEFINED TEST FAILED - Answer format needs improvement")
        return False

def test_random_gaia_question():
    """Fetch one random question, run the agent on it, and validate the result.

    Uses ``fetch_random_question`` to obtain the question (read through
    ``.get("task_id")`` / ``.get("question")``), calls ``smart_agent`` on it,
    and prints the answer, a reasoning preview, sanity checks and a preview of
    the submission entry. Nothing is submitted.

    Returns:
        bool: ``True`` when a question was fetched, the agent returned a
        non-empty answer, and the required submission fields (``task_id``,
        ``model_answer``) are present and non-empty. ``False`` when fetching
        or the agent raises, when no question text is available, when the
        answer is empty/``None``, or when a required field is missing.
        The error-marker and length checks only print warnings.
    """
    print("🔧 GAIA Random Question Test")
    print("="*60)

    # Step 1: Fetch a random question
    print("📡 Fetching random question from GAIA API...")
    try:
        question_data = fetch_random_question()
        if not question_data:
            print("❌ Failed to fetch random question")
            return False

        # Kept inside the try so an unexpected response shape is reported
        # as a fetch error rather than crashing the script.
        task_id = question_data.get("task_id", "unknown")
        question_text = question_data.get("question", "")
    except Exception as e:
        print(f"❌ Error fetching question: {e}")
        return False

    if not question_text:
        print("❌ No question text in response")
        return False

    print("✅ Successfully fetched question")
    print(f"📋 Task ID: {task_id}")
    print(f"❓ Question: {question_text}")
    print()

    # Step 2: Run the agent. perf_counter is monotonic, so the measured
    # duration is unaffected by wall-clock adjustments.
    print("🤖 Running smart agent on the question...")
    try:
        start_time = time.perf_counter()
        answer, reasoning_trace = smart_agent(question_text, task_id)
        processing_time = time.perf_counter() - start_time
    except Exception as e:
        print(f"❌ Error running agent: {e}")
        return False

    print(f"✅ Agent completed in {processing_time:.2f} seconds")
    print()

    # Normalise once so the checks below cannot raise on None or on a
    # non-string value; string results are left exactly as returned.
    answer_text = str(answer) if answer else ""
    trace_text = str(reasoning_trace) if reasoning_trace else ""

    # Step 3: Display results
    print("📊 AGENT RESULTS")
    print("-" * 40)
    print(f"🎯 Formatted Answer: '{answer}'")
    print(f"📝 Reasoning Length: {len(trace_text)} characters")
    print(f"⏱️  Processing Time: {processing_time:.2f}s")
    print()

    # Step 4: Show reasoning trace preview (first 300 characters)
    print("🧠 REASONING TRACE PREVIEW")
    print("-" * 40)
    if len(trace_text) > 300:
        reasoning_preview = trace_text[:300] + "..."
    else:
        reasoning_preview = trace_text
    print(reasoning_preview)
    print()

    # Step 5: Validate answer format
    print("✅ ANSWER VALIDATION")
    print("-" * 40)

    # An empty (or whitespace-only) answer fails the test outright.
    if answer_text.strip():
        print("✅ Answer is not empty")
    else:
        print("❌ Answer is empty or None")
        return False

    # Check if answer contains error messages (case-insensitive)
    answer_upper = answer_text.upper()
    if "ERROR" in answer_upper or "FAILED" in answer_upper:
        print("⚠️  Answer contains error message")
    else:
        print("✅ Answer appears to be valid (no error messages)")

    # Check answer length (reasonable bounds)
    if len(answer_text) > 1000:
        print("⚠️  Answer is very long (>1000 chars) - might need review")
    else:
        print("✅ Answer length is reasonable")

    print()

    # Step 6: Show submission format
    print("📡 SUBMISSION FORMAT PREVIEW")
    print("-" * 40)

    submission_entry = {
        "task_id": task_id,
        "model_answer": answer,
        "reasoning_trace": reasoning_trace
    }

    # Validate required fields
    required_fields = ["task_id", "model_answer"]
    all_valid = True

    for field in required_fields:
        value = submission_entry.get(field)
        if value:
            # Convert before slicing: a non-string value (e.g. a numeric
            # task id) cannot be sliced, while its text form can.
            value_text = str(value)
            ellipsis = "..." if len(value_text) > 50 else ""
            print(f"✅ {field}: '{value_text[:50]}{ellipsis}'")
        else:
            print(f"❌ Missing or empty {field}")
            all_valid = False

    # Check optional fields
    if trace_text:
        print(f"✅ reasoning_trace: Present ({len(trace_text)} chars)")
    else:
        print("ℹ️  reasoning_trace: Not present (optional)")

    print()

    # Step 7: Final validation. The answer is already known to be non-empty
    # here, so the verdict rests on the required-field check.
    print("🏁 FINAL VALIDATION")
    print("-" * 40)

    if all_valid:
        print("✅ ALL CHECKS PASSED - Agent is ready for submission!")
        print("🚀 You can now run the full evaluation with confidence.")
        return True
    else:
        print("❌ SOME CHECKS FAILED - Please review the issues above.")
        return False

if __name__ == "__main__":
    # Script entry point: run only the predefined 1928 Olympics test and
    # print a short summary framed by separator lines.
    separator = "=" * 60

    for intro_line in (
        "🧪 Testing agent with predefined GAIA question...",
        "This test validates web search functionality and answer formatting.",
        "",
    ):
        print(intro_line)

    passed = test_predefined_gaia_question()

    print(f"\n{separator}")
    if not passed:
        print("⚠️  Predefined test revealed issues that need to be addressed.")
    else:
        print("🎉 Predefined test completed successfully! Agent produces well-defined answers.")
        print("💡 You can also run test_random_gaia_question() for additional testing.")
    print(separator)