Ashokdll committed on
Commit
47415c5
·
verified ·
1 Parent(s): 3a8f426

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +845 -0
app.py ADDED
@@ -0,0 +1,845 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import json
3
+ import re
4
+ import ast
5
+ import operator
6
+ from typing import Dict, List, Any, Optional
7
+ import time
8
+
9
class GAIAAgent:
    """Rule-based agent that answers GAIA-style questions.

    The agent classifies a question with keyword heuristics, builds an ordered
    plan of tool invocations, runs the plan, picks an answer from the collected
    results and normalises it for exact-match scoring.
    """

    def __init__(self, api_base_url: str):
        """Store the API base URL and build the tool registry.

        Args:
            api_base_url: Base URL handed to the file processor tool.
        """
        self.api_base_url = api_base_url
        self.tools = self._initialize_tools()
        # NOTE: these two settings are stored but not read inside this class.
        self.max_retries = 3
        self.timeout = 30

    def _initialize_tools(self):
        """Initialize all available tools, keyed by the name used in plans."""
        return {
            'web_search': WebSearchTool(),
            'calculator': CalculatorTool(),
            'file_processor': FileProcessorTool(self.api_base_url),
            'text_analyzer': TextAnalyzerTool(),
        }

    def solve_question(self, question_data: Dict) -> str:
        """Run the full pipeline for one question and return the answer text.

        Any exception raised by a pipeline stage is reported on stdout and
        turned into the sentinel string "Unable to determine answer".
        """
        try:
            # Step 1: Analyze the question
            analysis = self._analyze_question(question_data)
            print(f"Question analysis: {analysis}")

            # Step 2: Create execution plan
            plan = self._create_execution_plan(analysis, question_data)
            print(f"Execution plan: {[step['action'] for step in plan]}")

            # Step 3: Execute plan
            results = self._execute_plan(plan, question_data)
            print(f"Execution results keys: {list(results.keys())}")

            # Step 4: Generate final answer
            final_answer = self._generate_final_answer(results, question_data)

            # Step 5: Format and validate answer
            return self._format_final_answer(final_answer)

        except Exception as e:
            # Top-level boundary: callers always receive a string answer.
            print(f"Error solving question: {e}")
            return "Unable to determine answer"

    def _analyze_question(self, question_data: Dict) -> Dict:
        """Classify the question with keyword heuristics.

        The checks are plain substring tests on the lower-cased question, so a
        keyword also matches inside a longer word (e.g. 'sum' in 'assume').

        Returns:
            Dict of boolean flags plus the question text and file name.
        """
        question = question_data.get('question', '')
        has_file = bool(question_data.get('file_name'))
        question_lower = question.lower()

        def mentions(terms):
            return any(term in question_lower for term in terms)

        return {
            'needs_calculation': mentions(
                ['calculate', 'compute', 'sum', 'total', 'average', 'count', 'multiply', 'divide']),
            'needs_web_search': mentions(
                ['who', 'what', 'when', 'where', 'find', 'search', 'latest', 'current']),
            'needs_file_processing': has_file,
            'is_factual_question': mentions(
                ['who is', 'what is', 'when was', 'where is']),
            'needs_analysis': mentions(
                ['analyze', 'compare', 'determine', 'evaluate']),
            'question_text': question,
            'has_file': has_file,
            'file_name': question_data.get('file_name', ''),
        }

    def _create_execution_plan(self, analysis: Dict, question_data: Dict) -> List[Dict]:
        """Build the list of tool steps, sorted by ascending priority.

        Text analysis is always scheduled; the other steps depend on the
        flags produced by ``_analyze_question``.
        """
        plan = []

        # Priority 1: Process files if they exist
        if analysis['needs_file_processing']:
            plan.append({
                'action': 'process_file',
                'tool': 'file_processor',
                'priority': 1,
                'params': {
                    'task_id': question_data.get('task_id'),
                    'file_name': question_data.get('file_name'),
                },
            })

        # Priority 2: Web search for factual information
        if analysis['needs_web_search'] or analysis['is_factual_question']:
            plan.append({
                'action': 'web_search',
                'tool': 'web_search',
                'priority': 2,
                'params': {
                    'query': self._extract_search_query(analysis['question_text']),
                },
            })

        # Priority 3: Calculations
        if analysis['needs_calculation']:
            plan.append({
                'action': 'calculate',
                'tool': 'calculator',
                'priority': 3,
                'params': {},
            })

        # Priority 4: Text analysis
        plan.append({
            'action': 'analyze_text',
            'tool': 'text_analyzer',
            'priority': 4,
            'params': {
                'text': analysis['question_text'],
            },
        })

        return sorted(plan, key=lambda x: x['priority'])

    def _execute_plan(self, plan: List[Dict], question_data: Dict) -> Dict:
        """Run each planned step and collect the outputs in one dict.

        A failing step does not stop the plan: its message is stored under
        ``'<action>_error'`` and execution continues with the next step.
        """
        results = {}

        for step in plan:
            tool_name = step['tool']
            action = step['action']

            try:
                print(f"Executing: {action}")

                if action == 'process_file':
                    results['file_data'] = self.tools[tool_name].process_file(
                        step['params']['task_id'],
                        step['params']['file_name'],
                    )

                elif action == 'web_search':
                    results['search_data'] = self.tools[tool_name].search(
                        step['params']['query']
                    )

                elif action == 'calculate':
                    # Extract numbers and operations from question and file data
                    calculation_input = self._prepare_calculation_input(
                        question_data, results
                    )
                    if calculation_input:
                        results['calculation'] = self.tools[tool_name].calculate(
                            calculation_input
                        )

                elif action == 'analyze_text':
                    # Earlier results are passed along as context.
                    results['text_analysis'] = self.tools[tool_name].analyze(
                        step['params']['text'],
                        context=results,
                    )

            except Exception as e:
                print(f"Error in {action}: {e}")
                results[f'{action}_error'] = str(e)

        return results

    def _extract_search_query(self, question: str) -> str:
        """Reduce the question to at most six lower-cased search terms.

        Words are split on whitespace only, so attached punctuation is kept.
        Question words and words of two characters or fewer are dropped.
        """
        question_words = ['what', 'who', 'when', 'where', 'how', 'why', 'is', 'are', 'was', 'were']
        words = question.lower().split()

        filtered_words = [word for word in words if word not in question_words and len(word) > 2]

        return ' '.join(filtered_words[:6])  # Limit to 6 words

    def _prepare_calculation_input(self, question_data: Dict, results: Dict) -> Optional[str]:
        """Build an arithmetic expression string for the calculator.

        Numbers found in the question text are combined according to the
        first matching keyword (sum/total, multiply/product, average). If
        that yields nothing, numbers from processed file data are summed when
        the question asks for a sum or total.

        Returns:
            The expression, or None when nothing can be computed.
        """
        question = question_data.get('question', '')
        question_lower = question.lower()
        wants_sum = 'sum' in question_lower or 'total' in question_lower

        # Extract numbers from question
        numbers = re.findall(r'\d+\.?\d*', question)

        # Look for mathematical operations
        if wants_sum:
            if numbers:
                return '+'.join(numbers)
        elif 'multiply' in question_lower or 'product' in question_lower:
            if numbers:
                return '*'.join(numbers)
        elif 'average' in question_lower:
            if numbers:
                return f"({'+'.join(numbers)})/{len(numbers)}"

        # Check if file data contains numbers for calculation
        if 'file_data' in results and isinstance(results['file_data'], dict):
            file_numbers = results['file_data'].get('numbers', [])
            if file_numbers and wants_sum:
                return '+'.join(map(str, file_numbers))

        return None

    def _generate_final_answer(self, results: Dict, question_data: Dict) -> str:
        """Pick the answer from the results in a fixed priority order.

        Order: calculation, file data ('answer' then 'summary'), the first
        search result carrying a 'summary', text analysis, then the sentinel
        "Unable to determine answer". ``question_data`` is accepted for
        interface compatibility and is not consulted.
        """
        if results.get('calculation') is not None:
            return str(results['calculation'])

        file_data = results.get('file_data')
        if isinstance(file_data, dict):
            # Look for specific answer in file data
            if 'answer' in file_data:
                return str(file_data['answer'])
            if 'summary' in file_data:
                return str(file_data['summary'])

        if 'search_data' in results and results['search_data']:
            # Extract answer from search results
            for result in results['search_data']:
                if isinstance(result, dict) and 'summary' in result:
                    return result['summary']

        if 'text_analysis' in results:
            return str(results['text_analysis'])

        return "Unable to determine answer"

    def _format_final_answer(self, answer: str) -> str:
        """Normalise an answer for exact-match scoring.

        Strips whitespace and one known leading prefix (case-insensitive),
        canonicalises finite numbers, and lower-cases yes/no/true/false.
        A falsy answer yields "No answer found".
        """
        if not answer:
            return "No answer found"

        # Convert to string and strip whitespace
        answer = str(answer).strip()

        # Remove common prefixes that might cause exact match failures
        prefixes_to_remove = [
            'the answer is: ',
            'answer: ',
            'final answer: ',
            'result: ',
            'solution: ',
        ]

        answer_lower = answer.lower()
        for prefix in prefixes_to_remove:
            if answer_lower.startswith(prefix):
                answer = answer[len(prefix):].strip()
                break

        # Handle numeric answers
        if self._is_numeric_answer(answer):
            return self._format_numeric_answer(answer)

        # Handle yes/no answers
        if answer.lower() in ['yes', 'no', 'true', 'false']:
            return answer.lower()

        # Return cleaned text answer
        return answer

    def _is_numeric_answer(self, answer: str) -> bool:
        """Return True when the answer parses as a finite number.

        ``float`` also accepts 'nan', 'inf' and 'infinity'; those are treated
        as text so that such answers are not rewritten by numeric formatting.
        """
        import math

        try:
            return math.isfinite(float(answer))
        except ValueError:
            return False

    def _format_numeric_answer(self, answer: str) -> str:
        """Render a numeric answer in a canonical plain-decimal form.

        Whole numbers are printed without a decimal point. Other values are
        rounded to six decimal places with trailing zeros removed, using
        fixed-point formatting so the result never uses scientific notation.
        Input that is not a finite number is returned unchanged.
        """
        import math

        try:
            num = float(answer)
        except ValueError:
            return answer

        if not math.isfinite(num):
            return answer
        if num.is_integer():
            return str(int(num))

        text = f"{num:.6f}".rstrip('0').rstrip('.')
        # A tiny negative value rounds to "-0"; report it as plain zero.
        return '0' if text == '-0' else text
285
+
286
+
287
class WebSearchTool:
    """Simple web search tool (implement with your preferred search API)"""

    def search(self, query: str, max_results: int = 3) -> List[Dict]:
        """Return placeholder search results for ``query``.

        This is a stub: it performs no network access and fabricates a single
        result that echoes the query. Replace it with an actual search API
        (DuckDuckGo, Google Custom Search, etc.).

        Args:
            query: Search terms.
            max_results: Upper bound on the number of results returned;
                values below zero are treated as zero.

        Returns:
            A list of dicts with 'title', 'summary' and 'url' keys.
        """
        print(f"Web search: {query}")

        placeholder_results = [
            {
                'title': f'Search result for: {query}',
                'summary': f'Information about {query}',
                'url': 'https://example.com',
            }
        ]
        # Honour the caller's limit so a real backend can be dropped in later.
        return placeholder_results[:max(max_results, 0)]
303
+
304
+
305
class CalculatorTool:
    """Safe calculator for mathematical expressions"""

    # Upper bound on the estimated bit size of an integer power result;
    # keeps inputs such as "9**9**9" from consuming unbounded time and memory.
    _MAX_POW_BITS = 10_000

    def calculate(self, expression: str) -> Optional[float]:
        """Evaluate an arithmetic expression without using ``eval``.

        Only digits, ``+ - * / ( ) . e`` and spaces are accepted. The
        expression is parsed with ``ast`` and evaluated by walking the tree.

        Args:
            expression: Arithmetic expression text.

        Returns:
            The numeric result, or None when the expression is rejected or
            evaluation fails (syntax error, division by zero, overflow, ...).
        """
        try:
            # Remove whitespace
            expression = expression.replace(' ', '')

            # Basic safety check
            allowed_chars = set('0123456789+-*/().e')
            if not all(c in allowed_chars for c in expression):
                raise ValueError("Invalid characters in expression")

            # Use ast for safe evaluation
            node = ast.parse(expression, mode='eval')
            return self._eval_node(node.body)

        except Exception as e:
            # Best-effort contract: every failure is reported as None.
            print(f"Calculation error: {e}")
            return None

    def _eval_node(self, node):
        """Recursively evaluate an AST node made of numbers and arithmetic.

        Raises:
            ValueError: For any node or operator outside the supported set,
                for non-numeric constants, and for oversized integer powers.
        """
        if isinstance(node, ast.Constant):
            value = node.value
            # "..." passes the character filter but parses to Ellipsis.
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                raise ValueError(f"Unsupported constant: {value!r}")
            return value

        if isinstance(node, ast.BinOp):
            left = self._eval_node(node.left)
            right = self._eval_node(node.right)

            if isinstance(node.op, ast.Add):
                return left + right
            if isinstance(node.op, ast.Sub):
                return left - right
            if isinstance(node.op, ast.Mult):
                return left * right
            if isinstance(node.op, ast.Div):
                return left / right
            if isinstance(node.op, ast.Pow):
                if (isinstance(left, int) and isinstance(right, int) and right > 0
                        and left.bit_length() * right > self._MAX_POW_BITS):
                    raise ValueError("Exponentiation result too large")
                return left ** right

        elif isinstance(node, ast.UnaryOp):
            operand = self._eval_node(node.operand)
            if isinstance(node.op, ast.USub):
                return -operand
            if isinstance(node.op, ast.UAdd):
                return +operand

        raise ValueError(f"Unsupported operation: {type(node)}")
357
+
358
+
359
class FileProcessorTool:
    """Tool for processing files from GAIA tasks"""

    def __init__(self, api_base_url: str, timeout: float = 30):
        """Remember where files are downloaded from.

        Args:
            api_base_url: Base URL; files are fetched from
                ``{api_base_url}/files/{task_id}``.
            timeout: Seconds to wait for the download before giving up.
        """
        self.api_base_url = api_base_url
        self.timeout = timeout

    def process_file(self, task_id: str, file_name: str) -> Dict:
        """Download the file for ``task_id`` and parse it by extension.

        The extension check is case-insensitive. '.csv', '.txt' and '.json'
        have dedicated parsers; anything else goes through the generic path.

        Returns:
            The parser's result dict, or ``{'error': message}`` when the
            download or dispatch fails.
        """
        try:
            # Download file
            file_content = self._download_file(task_id)

            # Process based on file extension
            lowered_name = (file_name or '').lower()
            if lowered_name.endswith('.csv'):
                return self._process_csv(file_content)
            if lowered_name.endswith('.txt'):
                return self._process_text(file_content)
            if lowered_name.endswith('.json'):
                return self._process_json(file_content)
            return self._process_generic(file_content)

        except Exception as e:
            print(f"File processing error: {e}")
            return {'error': str(e)}

    def _download_file(self, task_id: str) -> bytes:
        """Fetch the raw file bytes; raises on HTTP error status or timeout."""
        # Without a timeout, requests may wait indefinitely on a stalled server.
        response = requests.get(
            f"{self.api_base_url}/files/{task_id}", timeout=self.timeout
        )
        response.raise_for_status()
        return response.content

    def _process_csv(self, content: bytes) -> Dict:
        """Parse CSV bytes into headers, data rows and numeric cell values.

        The first row is treated as the header. Every data cell that
        ``float`` can parse is collected into 'numbers'.
        """
        try:
            import io
            import csv

            # 'utf-8-sig' also strips a leading byte-order mark, which would
            # otherwise end up inside the first header name.
            text_content = content.decode('utf-8-sig')

            # Parse CSV
            rows = list(csv.reader(io.StringIO(text_content)))

            if not rows:
                return {'error': 'Empty CSV file'}

            headers = rows[0]
            data_rows = rows[1:]

            # Extract numbers for potential calculations
            numbers = []
            for row in data_rows:
                for cell in row:
                    try:
                        numbers.append(float(cell))
                    except ValueError:
                        continue

            return {
                'type': 'csv',
                'headers': headers,
                'rows': data_rows,
                'row_count': len(data_rows),
                'numbers': numbers,
                'summary': f'CSV with {len(headers)} columns and {len(data_rows)} rows',
            }

        except Exception as e:
            return {'error': f'CSV processing failed: {e}'}

    def _process_text(self, content: bytes) -> Dict:
        """Decode text bytes and collect line/word counts and numbers.

        Numbers are unsigned digit runs with an optional decimal part.
        """
        try:
            text = content.decode('utf-8-sig')

            # Extract numbers from text
            numbers = [float(match) for match in re.findall(r'\d+\.?\d*', text)]

            # Basic text analysis
            lines = text.split('\n')
            words = text.split()

            return {
                'type': 'text',
                'content': text,
                'line_count': len(lines),
                'word_count': len(words),
                'numbers': numbers,
                'summary': f'Text file with {len(lines)} lines and {len(words)} words',
            }

        except Exception as e:
            return {'error': f'Text processing failed: {e}'}

    def _process_json(self, content: bytes) -> Dict:
        """Parse JSON bytes and collect every numeric value in the document."""
        try:
            # json.loads rejects text that still carries a BOM, so strip it.
            data = json.loads(content.decode('utf-8-sig'))

            # Extract numbers from JSON structure
            numbers = self._extract_numbers_from_json(data)

            return {
                'type': 'json',
                'data': data,
                'numbers': numbers,
                'summary': f'JSON file with {len(data) if isinstance(data, (list, dict)) else 1} items',
            }

        except Exception as e:
            return {'error': f'JSON processing failed: {e}'}

    def _process_generic(self, content: bytes) -> Dict:
        """Treat the content as text when it decodes as UTF-8, else as binary."""
        try:
            try:
                # Decoding is only a probe here; the text parser decodes again.
                content.decode('utf-8')
            except UnicodeDecodeError:
                # Binary file
                return {
                    'type': 'binary',
                    'size': len(content),
                    'summary': f'Binary file of {len(content)} bytes',
                }
            return self._process_text(content)

        except Exception as e:
            return {'error': f'Generic processing failed: {e}'}

    def _extract_numbers_from_json(self, data, numbers=None):
        """Recursively collect int/float values from nested dicts and lists.

        Booleans are skipped even though ``bool`` is a subclass of ``int``.

        Args:
            data: Decoded JSON value.
            numbers: Accumulator list; a new list is created when omitted.

        Returns:
            The accumulator, with every number appended as a float.
        """
        if numbers is None:
            numbers = []

        if isinstance(data, bool):
            return numbers
        if isinstance(data, (int, float)):
            numbers.append(float(data))
        elif isinstance(data, dict):
            for value in data.values():
                self._extract_numbers_from_json(value, numbers)
        elif isinstance(data, list):
            for item in data:
                self._extract_numbers_from_json(item, numbers)

        return numbers
506
+
507
+
508
class TextAnalyzerTool:
    """Tool for analyzing and extracting information from text"""

    def analyze(self, text: str, context: Optional[Dict] = None) -> str:
        """Produce a short textual analysis of ``text``.

        The lower-cased text is checked with substring tests: question words
        route to the question-pattern analysis, calculation words to the
        calculation analysis, otherwise up to five keywords are listed.

        Args:
            text: Text to analyse (the question).
            context: Results gathered by earlier tools, if any.

        Returns:
            The analysis string, or "Analysis failed: ..." on any error.
        """
        try:
            # Basic keyword extraction
            keywords = self._extract_keywords(text)
            lowered = text.lower()

            # Look for specific patterns based on question type
            if any(word in lowered for word in ['who', 'what', 'when', 'where']):
                return self._analyze_question_pattern(text, context)

            # Look for calculations
            if any(word in lowered for word in ['calculate', 'sum', 'total', 'average']):
                return self._analyze_calculation_pattern(text, context)

            # Default analysis
            return f"Analysis of text with keywords: {', '.join(keywords[:5])}"

        except Exception as e:
            return f"Analysis failed: {e}"

    def _extract_keywords(self, text: str) -> List[str]:
        """Return up to ten most frequent alphabetic words of 3+ letters.

        Words are lower-cased and a small fixed stop-word list is removed.
        """
        from collections import Counter

        # Simple keyword extraction
        words = re.findall(r'\b[A-Za-z]{3,}\b', text.lower())

        # Remove common stop words
        stop_words = {'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'can', 'had', 'her', 'was', 'one', 'our', 'out', 'day', 'get', 'has', 'him', 'his', 'how', 'man', 'new', 'now', 'old', 'see', 'two', 'way', 'who', 'boy', 'did', 'its', 'let', 'put', 'say', 'she', 'too', 'use'}

        keywords = [word for word in words if word not in stop_words]

        # Return most frequent keywords
        return [word for word, _count in Counter(keywords).most_common(10)]

    def _analyze_question_pattern(self, text: str, context: Optional[Dict]) -> str:
        """Answer a who/what/when/where question from search results.

        Returns the 'summary' of the first search result when the context
        holds a non-empty list whose first entry is a dict; otherwise a fixed
        fallback message. This is where more sophisticated NLP would go.
        """
        if context and 'search_data' in context:
            search_results = context['search_data']
            if search_results and isinstance(search_results, list):
                first_result = search_results[0]
                # Guard against results that are not dicts (no .get method).
                if isinstance(first_result, dict):
                    return first_result.get('summary', 'No summary available')

        return "Unable to extract specific answer from question pattern"

    def _analyze_calculation_pattern(self, text: str, context: Optional[Dict]) -> str:
        """Report an existing calculation result, else the numbers in the text."""
        if context and 'calculation' in context:
            return str(context['calculation'])

        # Extract numbers for potential calculation
        numbers = re.findall(r'\d+\.?\d*', text)
        if numbers:
            return f"Found numbers: {', '.join(numbers)}"

        return "No calculation pattern found"
568
+
569
+
570
+ # Main execution functions
571
def test_agent_on_random_question(api_base_url: str):
    """Fetch one random question from the API and run the agent on it.

    Progress and the answer are printed to stdout.

    Args:
        api_base_url: Base URL; the question comes from
            ``{api_base_url}/random-question``.

    Returns:
        Dict with 'task_id', 'question', 'agent_answer' and
        'processing_time' (seconds), or None when anything fails.
    """
    agent = GAIAAgent(api_base_url)

    try:
        # Get random question; fail fast on a stalled server or an HTTP error
        # instead of trying to solve an error payload.
        response = requests.get(
            f"{api_base_url}/random-question", timeout=agent.timeout
        )
        response.raise_for_status()
        question = response.json()

        print("=" * 50)
        print("TESTING RANDOM QUESTION")
        print("=" * 50)
        print(f"Task ID: {question.get('task_id')}")
        print(f"Question: {question.get('question')}")
        print(f"File: {question.get('file_name', 'None')}")
        print("-" * 50)

        # Solve question
        start_time = time.time()
        answer = agent.solve_question(question)
        elapsed = time.time() - start_time

        print(f"Agent Answer: {answer}")
        print(f"Processing Time: {elapsed:.2f} seconds")
        print("=" * 50)

        return {
            'task_id': question.get('task_id'),
            'question': question.get('question'),
            'agent_answer': answer,
            'processing_time': elapsed,
        }

    except Exception as e:
        print(f"Error testing random question: {e}")
        return None
607
+
608
+
609
def run_full_evaluation(api_base_url: str, username: str, agent_code_url: str):
    """Answer every question served by the API and submit the answers.

    Args:
        api_base_url: Base URL; questions come from ``{api_base_url}/questions``.
        username: User name sent with the submission.
        agent_code_url: Link to the agent code sent with the submission.

    Returns:
        Dict with 'answers', 'successful_answers', 'total_questions' and
        'submission_result', or None when the run fails or the API returns
        no questions.
    """
    agent = GAIAAgent(api_base_url)

    try:
        # Get all questions
        response = requests.get(f"{api_base_url}/questions", timeout=agent.timeout)
        response.raise_for_status()
        questions = response.json()

        if not questions:
            # Nothing to evaluate; also avoids dividing by zero below.
            print("Error in full evaluation: no questions returned")
            return None

        print(f"Starting evaluation on {len(questions)} questions...")

        answers = []
        successful_answers = 0

        for i, question in enumerate(questions):
            task_id = question.get('task_id')
            # The question text may be missing; never let the preview crash.
            preview = (question.get('question') or '')[:100]

            print(f"\n{'='*60}")
            print(f"PROCESSING QUESTION {i+1}/{len(questions)}")
            print(f"{'='*60}")
            print(f"Task ID: {task_id}")
            print(f"Question: {preview}...")

            try:
                start_time = time.time()
                answer = agent.solve_question(question)
                end_time = time.time()

                answers.append({
                    'task_id': task_id,
                    'submitted_answer': answer,
                })

                print(f"Answer: {answer}")
                print(f"Time: {end_time - start_time:.2f}s")

                if answer and answer != "Unable to determine answer":
                    successful_answers += 1

            except Exception as e:
                print(f"Error processing question {i+1}: {e}")
                answers.append({
                    'task_id': task_id,
                    'submitted_answer': "Processing error",
                })

        print(f"\n{'='*60}")
        print("EVALUATION COMPLETE")
        print(f"{'='*60}")
        print(f"Successfully processed: {successful_answers}/{len(questions)} questions")
        print(f"Success rate: {(successful_answers/len(questions)*100):.1f}%")

        # Submit results
        print("\nSubmitting results...")
        submission_result = submit_results(api_base_url, username, agent_code_url, answers)

        return {
            'answers': answers,
            'successful_answers': successful_answers,
            'total_questions': len(questions),
            'submission_result': submission_result,
        }

    except Exception as e:
        print(f"Error in full evaluation: {e}")
        return None
673
+
674
+
675
def submit_results(api_base_url: str, username: str, agent_code_url: str,
                   answers: List[Dict], timeout: float = 30):
    """Submit results to the leaderboard.

    Posts the answers as JSON to ``{api_base_url}/submit`` and prints the
    outcome.

    Args:
        api_base_url: Base URL of the scoring API.
        username: User name sent as 'username'.
        agent_code_url: Link to the agent code, sent as 'agent_code'.
        answers: List of answer dicts sent as 'answers'.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON response on HTTP 200, otherwise None (non-200
        status, network failure, timeout or undecodable response).
    """
    try:
        submission_data = {
            'username': username,
            'agent_code': agent_code_url,
            'answers': answers,
        }

        # A timeout keeps a stalled server from blocking the run forever.
        response = requests.post(
            f"{api_base_url}/submit", json=submission_data, timeout=timeout
        )

        if response.status_code != 200:
            print(f"❌ Submission failed: {response.status_code}")
            print(f"Response: {response.text}")
            return None

        result = response.json()
        print("✅ Submission successful!")
        print(f"Score: {result.get('score', 'N/A')}%")
        print(f"Rank: {result.get('rank', 'N/A')}")
        return result

    except Exception as e:
        print(f"Error submitting results: {e}")
        return None
700
+
701
+
702
+ # Example usage and testing functions
703
if __name__ == "__main__":
    # Script entry point: run three smoke tests on random questions, then
    # optionally run and submit the full evaluation.

    # Configuration - Replace with actual values
    API_BASE_URL = "https://your-api-endpoint.com"  # Replace with actual API URL
    USERNAME = "your-huggingface-username"  # Replace with your username
    AGENT_CODE_URL = "https://huggingface.co/spaces/your-username/gaia-agent/tree/main"  # Replace with your space URL

    print("GAIA Agent Implementation")
    print("=" * 40)

    # Test on a few random questions first
    print("1. Testing on random questions...")
    for i in range(3):
        print(f"\n--- Random Test {i+1} ---")
        test_result = test_agent_on_random_question(API_BASE_URL)
        if test_result:
            print(f"✅ Test {i+1} completed")
        else:
            print(f"❌ Test {i+1} failed")

    # Ask user if they want to run full evaluation
    user_input = input("\nRun full evaluation on all 20 questions? (y/n): ")

    if user_input.lower() == 'y':
        print("\n" + "=" * 60)
        print("STARTING FULL EVALUATION")
        print("=" * 60)

        evaluation_result = run_full_evaluation(API_BASE_URL, USERNAME, AGENT_CODE_URL)

        if evaluation_result:
            # 'submission_result' is present but None when the submission
            # failed, so a .get default alone would not protect the lookup.
            submission = evaluation_result.get('submission_result') or {}
            score = submission.get('score', 'N/A')

            print("\n🎉 Evaluation completed!")
            print(f"Final score: {score}%")

            # Only compare when the server actually returned a number.
            if isinstance(score, (int, float)) and score >= 30:
                print("🏆 CONGRATULATIONS! You've achieved the 30% threshold!")
                print("🎓 You've earned your Certificate of Completion!")
            else:
                print("📈 Keep improving! You need 30% to earn the certificate.")
        else:
            print("❌ Evaluation failed. Please check your implementation.")

    else:
        print("Evaluation cancelled. Use the test functions to debug your agent first.")
746
+
747
+
748
+ # Additional utility functions for development and debugging
749
+
750
def debug_question_analysis(api_base_url: str, task_id: Optional[str] = None):
    """Print the analysis and execution plan for one question.

    Args:
        api_base_url: Base URL of the questions API.
        task_id: When given, the question with this id is looked up in the
            full question list; otherwise a random question is fetched.

    Returns:
        Tuple ``(question, analysis, plan)``, or None when no question was
        found.

    Raises:
        requests.RequestException: When the HTTP request fails, times out or
            returns an error status (not caught here).
    """
    agent = GAIAAgent(api_base_url)

    if task_id:
        # There is no per-task endpoint used here, so the question is searched
        # for in the full list.
        response = requests.get(f"{api_base_url}/questions", timeout=agent.timeout)
        response.raise_for_status()
        questions = response.json()
        question = next((q for q in questions if q.get('task_id') == task_id), None)
    else:
        # Get random question
        response = requests.get(
            f"{api_base_url}/random-question", timeout=agent.timeout
        )
        response.raise_for_status()
        question = response.json()

    if not question:
        print("Question not found")
        return

    print("QUESTION ANALYSIS DEBUG")
    print("=" * 40)
    print(f"Task ID: {question.get('task_id')}")
    print(f"Question: {question.get('question')}")
    print(f"File: {question.get('file_name', 'None')}")
    print("-" * 40)

    # Analyze question
    analysis = agent._analyze_question(question)
    print("Analysis Results:")
    for key, value in analysis.items():
        print(f"  {key}: {value}")

    # Create plan
    plan = agent._create_execution_plan(analysis, question)
    print("\nExecution Plan:")
    for i, step in enumerate(plan):
        print(f"  {i+1}. {step['action']} (priority: {step['priority']})")

    return question, analysis, plan
788
+
789
+
790
def benchmark_agent_performance(api_base_url: str, num_tests: int = 10):
    """Benchmark agent performance on multiple random questions.

    For each test a random question is fetched and solved; only the solving
    step is timed. A per-question line and a summary are printed.

    Args:
        api_base_url: Base URL; questions come from
            ``{api_base_url}/random-question``.
        num_tests: Number of random questions to try.

    Returns:
        List with one dict per attempt. Successful attempts carry
        'question_id', 'task_id', 'answer', 'processing_time' and 'success';
        failed attempts carry 'question_id', 'error' and 'success' (False).
    """
    agent = GAIAAgent(api_base_url)

    results = []
    total_time = 0
    successful_answers = 0

    print(f"BENCHMARKING AGENT ({num_tests} questions)")
    print("=" * 50)

    for i in range(num_tests):
        try:
            response = requests.get(
                f"{api_base_url}/random-question", timeout=agent.timeout
            )
            response.raise_for_status()
            question = response.json()

            start_time = time.time()
            answer = agent.solve_question(question)
            processing_time = time.time() - start_time
            total_time += processing_time

            # One definition of success, shared by the counter and the record.
            success = bool(answer) and answer != "Unable to determine answer"
            if success:
                successful_answers += 1
                status = "✅"
            else:
                status = "❌"

            print(f"{status} Question {i+1}: {processing_time:.2f}s - {answer[:50]}...")

            results.append({
                'question_id': i+1,
                'task_id': question.get('task_id'),
                'answer': answer,
                'processing_time': processing_time,
                'success': success,
            })

        except Exception as e:
            print(f"❌ Question {i+1}: Error - {e}")
            results.append({
                'question_id': i+1,
                'error': str(e),
                'success': False,
            })

    # Guard the averages so that num_tests <= 0 does not divide by zero.
    divisor = num_tests if num_tests > 0 else 1

    # Print summary
    print("\n" + "=" * 50)
    print("BENCHMARK RESULTS")
    print("=" * 50)
    print(f"Successful answers: {successful_answers}/{num_tests} ({successful_answers/divisor*100:.1f}%)")
    print(f"Average processing time: {total_time/divisor:.2f}s")
    print(f"Total time: {total_time:.2f}s")

    return results