File size: 8,295 Bytes
f9a7c9b
 
a5c9e62
 
f9a7c9b
 
a5c9e62
 
f9a7c9b
 
a5c9e62
 
f9a7c9b
a5c9e62
 
f9a7c9b
a5c9e62
 
 
f9a7c9b
a5c9e62
f9a7c9b
 
a5c9e62
 
f9a7c9b
a5c9e62
 
 
 
 
 
 
 
f9a7c9b
a5c9e62
 
f9a7c9b
a5c9e62
 
 
 
 
 
f9a7c9b
a5c9e62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f9a7c9b
a5c9e62
 
 
 
 
f9a7c9b
a5c9e62
 
f9a7c9b
 
 
a5c9e62
 
f9a7c9b
a5c9e62
 
f9a7c9b
a5c9e62
 
 
 
 
 
 
 
f9a7c9b
a5c9e62
 
 
 
 
 
 
 
 
 
 
 
 
 
f9a7c9b
 
a5c9e62
 
f9a7c9b
a5c9e62
 
 
 
 
 
f9a7c9b
 
a5c9e62
 
 
 
 
 
f9a7c9b
a5c9e62
 
 
f9a7c9b
a5c9e62
 
 
 
 
f9a7c9b
 
a5c9e62
 
 
 
 
 
 
 
 
 
 
f9a7c9b
a5c9e62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f9a7c9b
 
 
a5c9e62
 
 
 
 
 
 
 
 
 
 
f9a7c9b
 
a5c9e62
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
#!/usr/bin/env python3
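"""
Test script for validating agent performance on GAIA questions.

When run as a script it tests the complete pipeline on a predefined question
(1928 Summer Olympics) without submitting. test_random_gaia_question() fetches
one random question from the GAIA API and is available for additional testing.
"""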

import time
from utils import fetch_random_question, format_gaia_answer
from agent import smart_agent

def test_predefined_gaia_question():
    """Run the agent on a fixed GAIA question and sanity-check the answer.

    The 1928 Summer Olympics question (answer expected as an IOC country
    code) is passed to ``smart_agent``. The answer, a preview of the
    reasoning trace and the outcome of each format check are printed.

    Returns:
        bool: ``True`` when the agent returned a non-empty answer of at most
        five characters after stripping whitespace. ``False`` when the agent
        raised, or the answer was empty or longer than that. The IOC pattern
        check and the web-search check only print warnings; they do not
        affect the result.
    """
    import re  # local import: ``re`` is not among the file's top-level imports

    print("πŸ§ͺ Testing predefined GAIA question (1928 Olympics)")
    print("="*60)

    # Predefined question that requires web search
    question = "What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer."
    task_id = "predefined_test"

    print(f"❓ Question: {question}")
    print()

    # Run the agent
    print("πŸ€– Running smart agent on the predefined question...")
    try:
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by a system clock adjustment while the agent runs.
        started = time.perf_counter()
        answer, reasoning_trace = smart_agent(question, task_id)
        processing_time = time.perf_counter() - started
    except Exception as e:
        print(f"❌ Error running agent: {e}")
        return False

    print(f"βœ… Agent completed in {processing_time:.2f} seconds")
    print()

    # A missing trace is treated as empty text so that the length, preview
    # and search checks below cannot fail on None.
    reasoning_trace = "" if reasoning_trace is None else str(reasoning_trace)

    # Display results
    print("πŸ“Š AGENT RESULTS")
    print("-" * 40)
    print(f"🎯 Formatted Answer: '{answer}'")
    print(f"πŸ“ Reasoning Length: {len(reasoning_trace)} characters")
    print(f"⏱️  Processing Time: {processing_time:.2f}s")
    print()

    # Show reasoning trace preview (first 400 characters)
    print("🧠 REASONING TRACE PREVIEW")
    print("-" * 40)
    if len(reasoning_trace) > 400:
        reasoning_preview = reasoning_trace[:400] + "..."
    else:
        reasoning_preview = reasoning_trace
    print(reasoning_preview)
    print()

    # Validate answer format for GAIA
    print("βœ… GAIA FORMAT VALIDATION")
    print("-" * 40)

    # An empty answer is the only format check that fails the test outright.
    stripped_answer = answer.strip() if answer else ""
    if not stripped_answer:
        print("❌ Answer is empty or None")
        return False
    print("βœ… Answer is not empty")

    # Check if answer looks like IOC country code (2-3 uppercase letters)
    # NOTE(review): IOC country codes are three letters; this pattern also
    # accepts two-letter answers - confirm whether that is intended.
    if re.match(r'^[A-Z]{2,3}$', stripped_answer):
        print(f"βœ… Answer '{answer}' matches IOC country code format")
    else:
        print(f"⚠️  Answer '{answer}' may not be in correct IOC format (should be 2-3 uppercase letters)")

    # Check if web search was used. Any trace containing "web_search" also
    # contains "search", so a single substring test covers both spellings.
    if "search" in reasoning_trace.lower():
        print("βœ… Agent appears to have used web search")
    else:
        print("⚠️  No clear evidence of web search usage")

    # Check answer length (should be short for country code)
    is_short = len(stripped_answer) <= 5
    if is_short:
        print("βœ… Answer length is appropriate for country code")
    else:
        print("⚠️  Answer seems too long for a country code")

    print()

    # Final validation: the answer is already known to be non-empty here, so
    # the verdict depends only on the length check.
    print("🏁 FINAL VALIDATION")
    print("-" * 40)

    if is_short:
        print("βœ… PREDEFINED TEST PASSED - Answer format suitable for GAIA")
        print(f"🎯 Agent produced: '{answer}' for 1928 Olympics question")
        return True

    print("❌ PREDEFINED TEST FAILED - Answer format needs improvement")
    return False

def test_random_gaia_question():
    """Fetch one random GAIA question, run the agent on it and validate the result.

    The question comes from ``fetch_random_question`` and is answered by
    ``smart_agent``. The function prints the answer, a preview of the
    reasoning trace, several answer checks and a preview of the submission
    entry. Nothing is submitted.

    Returns:
        bool: ``True`` when a question was fetched, the agent returned a
        non-empty answer, and both required submission fields (``task_id``
        and ``model_answer``) are non-empty. ``False`` when fetching or the
        agent raised, the response had no question text, the answer was
        empty, or a required field was missing. The error-word and
        answer-length checks only print warnings.
    """
    print("πŸ”§ GAIA Random Question Test")
    print("="*60)

    # Step 1: Fetch a random question
    print("πŸ“‘ Fetching random question from GAIA API...")
    try:
        question_data = fetch_random_question()
        if not question_data:
            print("❌ Failed to fetch random question")
            return False

        task_id = question_data.get("task_id", "unknown")
        question_text = question_data.get("question", "")
    except Exception as e:
        print(f"❌ Error fetching question: {e}")
        return False

    if not question_text:
        print("❌ No question text in response")
        return False

    print("βœ… Successfully fetched question")
    print(f"πŸ“‹ Task ID: {task_id}")
    print(f"❓ Question: {question_text}")
    print()

    # Step 2: Run the agent
    print("πŸ€– Running smart agent on the question...")
    try:
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by a system clock adjustment while the agent runs.
        started = time.perf_counter()
        answer, reasoning_trace = smart_agent(question_text, task_id)
        processing_time = time.perf_counter() - started
    except Exception as e:
        print(f"❌ Error running agent: {e}")
        return False

    print(f"βœ… Agent completed in {processing_time:.2f} seconds")
    print()

    # The trace is optional (see the submission preview below), so a missing
    # one is treated as empty text instead of letting len() fail on None.
    reasoning_trace = "" if reasoning_trace is None else str(reasoning_trace)

    # Step 3: Display results
    print("πŸ“Š AGENT RESULTS")
    print("-" * 40)
    print(f"🎯 Formatted Answer: '{answer}'")
    print(f"πŸ“ Reasoning Length: {len(reasoning_trace)} characters")
    print(f"⏱️  Processing Time: {processing_time:.2f}s")
    print()

    # Step 4: Show reasoning trace preview (first 300 characters)
    print("🧠 REASONING TRACE PREVIEW")
    print("-" * 40)
    if len(reasoning_trace) > 300:
        reasoning_preview = reasoning_trace[:300] + "..."
    else:
        reasoning_preview = reasoning_trace
    print(reasoning_preview)
    print()

    # Step 5: Validate answer format
    print("βœ… ANSWER VALIDATION")
    print("-" * 40)

    # An empty answer is the only answer check that fails the test outright.
    if not (answer and answer.strip()):
        print("❌ Answer is empty or None")
        return False
    print("βœ… Answer is not empty")

    # Check if answer contains error messages (warning only)
    answer_upper = answer.upper()
    if "ERROR" in answer_upper or "FAILED" in answer_upper:
        print("⚠️  Answer contains error message")
    else:
        print("βœ… Answer appears to be valid (no error messages)")

    # Check answer length (reasonable bounds, warning only)
    if len(answer) > 1000:
        print("⚠️  Answer is very long (>1000 chars) - might need review")
    else:
        print("βœ… Answer length is reasonable")

    print()

    # Step 6: Show submission format
    print("πŸ“‘ SUBMISSION FORMAT PREVIEW")
    print("-" * 40)

    submission_entry = {
        "task_id": task_id,
        "model_answer": answer,
        "reasoning_trace": reasoning_trace
    }

    # Validate required fields
    required_fields = ["task_id", "model_answer"]
    all_valid = True

    for field in required_fields:
        value = submission_entry[field]
        if value:
            # Convert once and use the same text for both the slice and the
            # length test, so a non-string value (e.g. a numeric task id)
            # cannot raise TypeError while being previewed.
            text = str(value)
            suffix = "..." if len(text) > 50 else ""
            print(f"βœ… {field}: '{text[:50]}{suffix}'")
        else:
            print(f"❌ Missing or empty {field}")
            all_valid = False

    # Check optional fields
    if submission_entry["reasoning_trace"]:
        print(f"βœ… reasoning_trace: Present ({len(submission_entry['reasoning_trace'])} chars)")
    else:
        print("ℹ️  reasoning_trace: Not present (optional)")

    print()

    # Step 7: Final validation. The answer was already confirmed non-empty in
    # step 5, so the verdict depends only on the required-field check.
    print("🏁 FINAL VALIDATION")
    print("-" * 40)

    if all_valid:
        print("βœ… ALL CHECKS PASSED - Agent is ready for submission!")
        print("πŸš€ You can now run the full evaluation with confidence.")
        return True

    print("❌ SOME CHECKS FAILED - Please review the issues above.")
    return False

if __name__ == "__main__":
    # Script entry point: run the predefined-question test, print a summary
    # banner, and report the outcome through the process exit status.
    print("πŸ§ͺ Testing agent with predefined GAIA question...")
    print("This test validates web search functionality and answer formatting.")
    print()

    # Test the predefined 1928 Olympics question
    success = test_predefined_gaia_question()

    print("\n" + "="*60)
    if success:
        print("πŸŽ‰ Predefined test completed successfully! Agent produces well-defined answers.")
        print("πŸ’‘ You can also run test_random_gaia_question() for additional testing.")
    else:
        print("⚠️  Predefined test revealed issues that need to be addressed.")
    print("="*60)

    # Exit non-zero on failure so shells and CI pipelines can detect a failed
    # run instead of having to parse the printed output.
    raise SystemExit(0 if success else 1)