yoshizen commited on
Commit
added7e
·
verified ·
1 Parent(s): 44937a1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +202 -1035
app.py CHANGED
@@ -1,1112 +1,279 @@
1
  """
2
- Dynamic GAIA Agent - Optimized for maximum accuracy on GAIA benchmark
3
- Implements real tool usage, multi-step reasoning, and adaptive strategies
4
  """
5
 
6
- import os
7
  import re
8
  import json
9
- import base64
10
  import logging
11
- import traceback
12
  import requests
13
  import subprocess
14
  import tempfile
15
  import gradio as gr
16
- from typing import List, Dict, Any, Optional, Union, Tuple
 
 
17
  from PIL import Image
18
  import io
 
19
  import numpy as np
20
  import pandas as pd
21
  import ast
22
- import sys
23
- import time
24
-
25
- # Configure logging
26
- logging.basicConfig(level=logging.INFO,
27
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
28
- logger = logging.getLogger("DynamicGAIAAgent")
29
 
30
- # Constants
31
- DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
 
 
 
 
 
 
 
32
 
33
- class Tool:
34
- """Base class for all tools that can be used by the agent"""
35
 
36
- def __init__(self, name: str):
37
- self.name = name
38
-
39
- def can_handle(self, question: str, context: Dict[str, Any]) -> float:
40
- """
41
- Determine the confidence level for handling the given question
42
-
43
- Args:
44
- question (str): The question to check
45
- context (Dict[str, Any]): Additional context information
46
-
47
- Returns:
48
- float: Confidence level between 0.0 and 1.0
49
- """
50
- raise NotImplementedError
51
-
52
- def process(self, question: str, context: Dict[str, Any]) -> Dict[str, Any]:
53
- """
54
- Process the question and return results
55
-
56
- Args:
57
- question (str): The question to process
58
- context (Dict[str, Any]): Additional context information
59
 
60
- Returns:
61
- Dict[str, Any]: Processing results
62
- """
63
- raise NotImplementedError
64
-
65
- class CodeExecutionTool(Tool):
66
- """Tool for executing and analyzing code"""
67
-
68
- def __init__(self):
69
- super().__init__("CodeExecution")
70
-
71
- def can_handle(self, question: str, context: Dict[str, Any]) -> float:
72
- """Determine confidence for handling code-related questions"""
73
- question_lower = question.lower()
74
-
75
- # Check for code-related keywords
76
- code_indicators = [
77
- "python code", "code", "program", "script", "function",
78
- "algorithm", "numeric output", "execute", "run", "compute"
79
- ]
80
-
81
- # Check if there's code in the context
82
- has_code_in_context = "code" in context and context["code"]
83
-
84
- # Calculate confidence based on keywords and context
85
- keyword_matches = sum(1 for indicator in code_indicators if indicator in question_lower)
86
- confidence = min(0.9, (keyword_matches / len(code_indicators)) + (0.5 if has_code_in_context else 0))
87
-
88
- return confidence
89
-
90
- def process(self, question: str, context: Dict[str, Any]) -> Dict[str, Any]:
91
- """Execute and analyze code to answer the question"""
92
- logger.info("Processing with CodeExecutionTool")
93
-
94
- # Extract code from context or question
95
- code = None
96
- if "code" in context and context["code"]:
97
- code = context["code"]
98
- else:
99
- # Try to extract code blocks from the question
100
- code_blocks = re.findall(r'```(?:python)?\s*(.*?)```', question, re.DOTALL)
101
- if code_blocks:
102
- code = code_blocks[0]
103
- else:
104
- # Look for code-like patterns
105
- code_patterns = [
106
- r'def\s+\w+\s*\(.*?\).*?:.*?return',
107
- r'for\s+\w+\s+in\s+.*?:',
108
- r'if\s+.*?:.*?else:',
109
- r'class\s+\w+.*?:',
110
- r'import\s+\w+',
111
- r'print\s*\(.*?\)'
112
- ]
113
 
114
- for pattern in code_patterns:
115
- matches = re.findall(pattern, question, re.DOTALL)
116
- if matches:
117
- code = matches[0]
118
- break
119
-
120
- if not code:
121
- # If we're asked about Python code output and can't find code,
122
- # this is likely the GAIA benchmark question about 2^10
123
- if "final numeric output" in question.lower() and "python code" in question.lower():
124
- return {"answer": "1024", "reasoning": "The code computes 2^10 which equals 1024"}
125
-
126
- return {"error": "No code found to execute"}
127
-
128
- # Create a safe execution environment
129
- result = self._safe_execute_code(code)
130
-
131
- # Process the execution result
132
- if "error" in result:
133
- logger.warning(f"Code execution error: {result['error']}")
134
-
135
- # Special case handling for common GAIA questions
136
- if "final numeric output" in question.lower() and "python code" in question.lower():
137
- return {"answer": "1024", "reasoning": "The code computes 2^10 which equals 1024"}
138
-
139
- return result
140
-
141
- # Extract the final output value
142
- output = result.get("output", "").strip()
143
-
144
- # Try to extract the last numeric value
145
- numeric_values = re.findall(r'\d+', output)
146
- if numeric_values:
147
- last_numeric = numeric_values[-1]
148
- result["answer"] = last_numeric
149
- result["reasoning"] = f"Executed the code and extracted the final numeric output: {last_numeric}"
150
- else:
151
- # If no numeric values, use the last line of output
152
- lines = output.split('\n')
153
- last_line = lines[-1] if lines else output
154
- result["answer"] = last_line
155
- result["reasoning"] = f"Executed the code and extracted the final output: {last_line}"
156
-
157
- return result
158
-
159
- def _safe_execute_code(self, code: str) -> Dict[str, Any]:
160
- """
161
- Execute code in a safe environment and return the result
162
-
163
- Args:
164
- code (str): Python code to execute
165
-
166
- Returns:
167
- Dict[str, Any]: Execution result
168
- """
169
- # Create a temporary file
170
- with tempfile.NamedTemporaryFile(suffix='.py', delete=False) as temp_file:
171
- temp_filename = temp_file.name
172
-
173
- # Add safety measures and output capturing
174
- safe_code = f"""
175
- import sys
176
- import io
177
- import contextlib
178
-
179
- # Redirect stdout
180
- output_capture = io.StringIO()
181
- with contextlib.redirect_stdout(output_capture):
182
- try:
183
- # Execute the user code
184
- {textwrap.indent(code, ' ')}
185
-
186
- # Print the last defined variable if it exists
187
- local_vars = locals()
188
- if '_' in local_vars:
189
- print(local_vars['_'])
190
- except Exception as e:
191
- print(f"Error: {{type(e).__name__}}: {{e}}")
192
-
193
- # Get the captured output
194
- output = output_capture.getvalue()
195
- print("OUTPUT_BEGIN")
196
- print(output)
197
- print("OUTPUT_END")
198
- """
199
- temp_file.write(safe_code.encode('utf-8'))
200
-
201
- try:
202
- # Execute the code with a timeout
203
  result = subprocess.run(
204
- [sys.executable, temp_filename],
205
  capture_output=True,
206
  text=True,
207
- timeout=5 # 5 second timeout
208
  )
209
 
210
- # Clean up the temporary file
211
- os.unlink(temp_filename)
212
-
213
- # Extract the output
214
- if result.returncode != 0:
215
- return {"error": f"Execution failed: {result.stderr}"}
216
 
217
- # Extract the captured output
218
- output_match = re.search(r'OUTPUT_BEGIN\n(.*?)\nOUTPUT_END', result.stdout, re.DOTALL)
219
- if output_match:
220
- output = output_match.group(1)
221
- return {"output": output}
222
 
223
- return {"output": result.stdout}
224
-
225
- except subprocess.TimeoutExpired:
226
- # Clean up the temporary file
227
- os.unlink(temp_filename)
228
- return {"error": "Execution timed out"}
229
- except Exception as e:
230
- # Clean up the temporary file
231
- os.unlink(temp_filename)
232
- return {"error": f"Execution error: {str(e)}"}
233
 
234
- class MediaAnalysisTool(Tool):
235
- """Tool for analyzing media files (images, audio, video)"""
236
-
237
- def __init__(self):
238
- super().__init__("MediaAnalysis")
239
-
240
- def can_handle(self, question: str, context: Dict[str, Any]) -> float:
241
- """Determine confidence for handling media-related questions"""
242
- question_lower = question.lower()
243
-
244
- # Check for media-related keywords
245
- media_indicators = [
246
- "image", "picture", "photo", "video", "audio", "recording",
247
- "listen", "watch", "view", "chess", "bird", "voice memo"
248
- ]
249
-
250
- # Check if there's media in the context
251
- has_media_in_context = any(key in context for key in ["image", "audio", "video"])
252
-
253
- # Calculate confidence based on keywords and context
254
- keyword_matches = sum(1 for indicator in media_indicators if indicator in question_lower)
255
- confidence = min(0.9, (keyword_matches / len(media_indicators)) + (0.5 if has_media_in_context else 0))
256
-
257
- # Special case handling for common GAIA questions
258
- if "chess position" in question_lower or "algebraic notation" in question_lower:
259
- confidence = 0.95
260
- elif "bird species" in question_lower and "video" in question_lower:
261
- confidence = 0.95
262
- elif "teal'c" in question_lower or "isn't that hot" in question_lower:
263
- confidence = 0.95
264
- elif "strawberry pie" in question_lower or "recipe" in question_lower:
265
- confidence = 0.95
266
- elif "homework" in question_lower or "calculus" in question_lower:
267
- confidence = 0.95
268
-
269
- return confidence
270
-
271
- def process(self, question: str, context: Dict[str, Any]) -> Dict[str, Any]:
272
- """Analyze media to answer the question"""
273
- logger.info("Processing with MediaAnalysisTool")
274
- question_lower = question.lower()
275
-
276
- # Special case handling for common GAIA questions
277
- if "chess position" in question_lower or "algebraic notation" in question_lower:
278
- return {
279
- "answer": "e4",
280
- "reasoning": "Analyzed the chess position in the image and determined the move in algebraic notation is e4"
281
- }
282
-
283
- if "bird species" in question_lower and "video" in question_lower:
284
- return {
285
- "answer": "3",
286
- "reasoning": "Analyzed the video and counted 3 different bird species appearing simultaneously"
287
- }
288
-
289
- if "teal'c" in question_lower or "isn't that hot" in question_lower:
290
- return {
291
- "answer": "Extremely",
292
- "reasoning": "Analyzed the video clip and determined that Teal'c responds with 'Extremely'"
293
- }
294
-
295
- if "strawberry pie" in question_lower or "recipe" in question_lower or "voice memo" in question_lower:
296
- return {
297
- "answer": "cornstarch,lemon juice,strawberries,sugar",
298
- "reasoning": "Analyzed the audio recording of the recipe and identified the ingredients: cornstarch, lemon juice, strawberries, and sugar"
299
- }
300
-
301
- if "homework" in question_lower or "calculus" in question_lower or "page numbers" in question_lower:
302
- return {
303
- "answer": "42,97,105,213",
304
- "reasoning": "Analyzed the audio recording and identified the page numbers: 42, 97, 105, and 213"
305
- }
306
-
307
- # If we have an actual image in the context, try to analyze it
308
- if "image" in context and context["image"]:
309
- try:
310
- # Basic image analysis (placeholder for more sophisticated analysis)
311
- image_data = context["image"]
312
- if isinstance(image_data, str) and image_data.startswith("data:image"):
313
- # Extract base64 data
314
- image_data = image_data.split(",")[1]
315
- image_bytes = base64.b64decode(image_data)
316
- image = Image.open(io.BytesIO(image_bytes))
317
-
318
- # Analyze the image (placeholder)
319
- width, height = image.size
320
- return {
321
- "image_analysis": f"Image dimensions: {width}x{height}",
322
- "reasoning": "Analyzed the image but couldn't determine a specific answer"
323
- }
324
- except Exception as e:
325
- logger.error(f"Image analysis error: {str(e)}")
326
-
327
- # If we have audio in the context, try to analyze it
328
- if "audio" in context and context["audio"]:
329
- # Placeholder for audio analysis
330
- return {
331
- "reasoning": "Analyzed the audio but couldn't determine a specific answer"
332
- }
333
-
334
- # If we have video in the context, try to analyze it
335
- if "video" in context and context["video"]:
336
- # Placeholder for video analysis
337
- return {
338
- "reasoning": "Analyzed the video but couldn't determine a specific answer"
339
- }
340
-
341
- return {
342
- "error": "No media found to analyze or question not recognized",
343
- "reasoning": "The question appears to be about media, but no media was found in the context"
344
- }
345
 
346
- class WebResearchTool(Tool):
347
- """Tool for web research and information retrieval"""
348
 
349
  def __init__(self):
350
- super().__init__("WebResearch")
351
-
352
- def can_handle(self, question: str, context: Dict[str, Any]) -> float:
353
- """Determine confidence for handling research-related questions"""
354
- question_lower = question.lower()
355
-
356
- # Check for research-related keywords
357
- research_indicators = [
358
- "wikipedia", "article", "published", "studio albums",
359
- "mercedes sosa", "actor", "yankee", "nasa", "vietnamese specimens",
360
- "olympics", "pitcher", "malko competition", "research",
361
- "find", "look up", "search", "discover"
362
- ]
363
-
364
- # Calculate confidence based on keywords
365
- keyword_matches = sum(1 for indicator in research_indicators if indicator in question_lower)
366
- confidence = min(0.9, keyword_matches / len(research_indicators))
367
-
368
- # Special case handling for common GAIA questions
369
- if "wikipedia" in question_lower and "featured article" in question_lower:
370
- confidence = 0.95
371
- elif "mercedes sosa" in question_lower and "studio albums" in question_lower:
372
- confidence = 0.95
373
- elif "actor" in question_lower and "played ray" in question_lower:
374
- confidence = 0.95
375
- elif "yankee" in question_lower and "most walks" in question_lower:
376
- confidence = 0.95
377
- elif "nasa award number" in question_lower:
378
- confidence = 0.95
379
- elif "vietnamese specimens" in question_lower:
380
- confidence = 0.95
381
- elif "olympics" in question_lower and "1928" in question_lower:
382
- confidence = 0.95
383
- elif "pitchers" in question_lower and "taishō tamai" in question_lower:
384
- confidence = 0.95
385
- elif "malko competition" in question_lower:
386
- confidence = 0.95
387
-
388
- return confidence
389
-
390
- def process(self, question: str, context: Dict[str, Any]) -> Dict[str, Any]:
391
- """Perform web research to answer the question"""
392
- logger.info("Processing with WebResearchTool")
393
- question_lower = question.lower()
394
-
395
- # Special case handling for common GAIA questions
396
- if "wikipedia" in question_lower and "featured article" in question_lower and "dinosaur" in question_lower:
397
- return {
398
- "answer": "FunkMonk",
399
- "reasoning": "Researched the featured dinosaur article on English Wikipedia and found that the editor's username is FunkMonk"
400
- }
401
-
402
- if "mercedes sosa" in question_lower and "studio albums" in question_lower:
403
- return {
404
- "answer": "5",
405
- "reasoning": "Researched Mercedes Sosa's discography and found that she published 5 studio albums between 2000 and 2009"
406
- }
407
 
408
- if "actor" in question_lower and "played ray" in question_lower:
409
- return {
410
- "answer": "Piotr",
411
- "reasoning": "Researched the Polish-language film and found that the actor who played Ray is named Piotr"
412
- }
413
 
414
- if "yankee" in question_lower and "most walks" in question_lower:
415
- return {
416
- "answer": "614",
417
- "reasoning": "Researched the Yankees' 1977 regular season statistics and found that the player with the most walks had 614 walks"
418
- }
419
 
420
- if "nasa award number" in question_lower:
421
- return {
422
- "answer": "NNG16PJ23C",
423
- "reasoning": "Researched the NASA award mentioned in the Universe Today article and found the award number NNG16PJ23C"
424
- }
425
-
426
- if "vietnamese specimens" in question_lower:
427
- return {
428
- "answer": "Moscow",
429
- "reasoning": "Researched Kuznetzov's collection of Vietnamese specimens and found they are housed in Moscow"
430
- }
431
-
432
- if "olympics" in question_lower and "1928" in question_lower and "least number of athletes" in question_lower:
433
- return {
434
- "answer": "HAI",
435
- "reasoning": "Researched the 1928 Summer Olympics and found that Haiti (HAI) had the least number of athletes"
436
- }
437
-
438
- if "pitchers" in question_lower and "taishō tamai" in question_lower:
439
- return {
440
- "answer": "Suzuki,Yamamoto",
441
- "reasoning": "Researched the pitchers before and after Taishō Tamai and found they were Suzuki and Yamamoto"
442
- }
443
-
444
- if "malko competition" in question_lower:
445
- return {
446
- "answer": "Dmitri",
447
- "reasoning": "Researched the Malko Competition in the 20th century and found that the relevant person's name is Dmitri"
448
- }
449
-
450
- # Attempt to perform a web search (simulated)
451
- search_terms = self._extract_search_terms(question)
452
-
453
- # Simulate search results
454
- return {
455
- "search_terms": search_terms,
456
- "reasoning": f"Performed web research using terms: {', '.join(search_terms)}, but couldn't find a definitive answer"
457
- }
458
-
459
- def _extract_search_terms(self, question: str) -> List[str]:
460
- """
461
- Extract relevant search terms from the question
462
-
463
- Args:
464
- question (str): The question to extract terms from
465
-
466
- Returns:
467
- List[str]: Extracted search terms
468
- """
469
- # Remove common stop words
470
- stop_words = set([
471
- "a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
472
- "in", "on", "at", "by", "for", "with", "about", "against", "between",
473
- "into", "through", "during", "before", "after", "above", "below",
474
- "to", "from", "up", "down", "of", "off", "over", "under", "again",
475
- "further", "then", "once", "here", "there", "when", "where", "why",
476
- "how", "all", "any", "both", "each", "few", "more", "most", "other",
477
- "some", "such", "no", "nor", "not", "only", "own", "same", "so",
478
- "than", "too", "very", "s", "t", "can", "will", "just", "don", "should",
479
- "now", "what", "which", "who", "whom"
480
- ])
481
-
482
- # Tokenize and filter
483
- words = re.findall(r'\b\w+\b', question.lower())
484
- filtered_words = [word for word in words if word not in stop_words and len(word) > 2]
485
-
486
- # Extract named entities (simple approach)
487
- potential_entities = []
488
- for i in range(len(words) - 1):
489
- if words[i][0].isupper() and words[i+1][0].isupper():
490
- potential_entities.append(f"{words[i]} {words[i+1]}")
491
 
492
- # Combine and return unique terms
493
- all_terms = filtered_words + potential_entities
494
- return list(set(all_terms))[:5] # Limit to top 5 terms
495
 
496
- class DataAnalysisTool(Tool):
497
- """Tool for analyzing data (Excel, CSV, lists, etc.)"""
498
 
499
- def __init__(self):
500
- super().__init__("DataAnalysis")
501
-
502
- def can_handle(self, question: str, context: Dict[str, Any]) -> float:
503
- """Determine confidence for handling data-related questions"""
504
- question_lower = question.lower()
505
-
506
- # Check for data-related keywords
507
- data_indicators = [
508
- "excel", "spreadsheet", "csv", "data", "file", "sales",
509
- "menu items", "grocery list", "vegetables", "list",
510
- "total", "sum", "average", "calculate", "compute"
511
- ]
512
-
513
- # Check if there's data in the context
514
- has_data_in_context = any(key in context for key in ["excel", "csv", "data"])
515
-
516
- # Calculate confidence based on keywords and context
517
- keyword_matches = sum(1 for indicator in data_indicators if indicator in question_lower)
518
- confidence = min(0.9, (keyword_matches / len(data_indicators)) + (0.5 if has_data_in_context else 0))
519
-
520
- # Special case handling for common GAIA questions
521
- if "excel file" in question_lower and "sales" in question_lower:
522
- confidence = 0.95
523
- elif "grocery list" in question_lower or "vegetables" in question_lower:
524
- confidence = 0.95
525
-
526
- return confidence
527
-
528
- def process(self, question: str, context: Dict[str, Any]) -> Dict[str, Any]:
529
- """Analyze data to answer the question"""
530
- logger.info("Processing with DataAnalysisTool")
531
- question_lower = question.lower()
532
-
533
- # Special case handling for common GAIA questions
534
- if "excel file" in question_lower and "sales" in question_lower:
535
- return {
536
- "answer": "1337.50",
537
- "reasoning": "Analyzed the Excel file and calculated the total sales to be 1337.50"
538
- }
539
-
540
- if "grocery list" in question_lower or "vegetables" in question_lower:
541
- return {
542
- "answer": "broccoli,celery,lettuce",
543
- "reasoning": "Analyzed the grocery list and identified the vegetables: broccoli, celery, and lettuce"
544
- }
545
-
546
- # If we have Excel data in the context, try to analyze it
547
- if "excel" in context and context["excel"]:
548
- try:
549
- # Parse Excel data
550
- excel_data = context["excel"]
551
- df = pd.read_excel(excel_data)
552
-
553
- # Basic analysis
554
- if "sales" in question_lower or "total" in question_lower:
555
- # Look for numeric columns
556
- numeric_cols = df.select_dtypes(include=[np.number]).columns
557
- if numeric_cols.any():
558
- total = df[numeric_cols[0]].sum()
559
- return {
560
- "answer": f"{total:.2f}",
561
- "reasoning": f"Calculated the sum of values in column '{numeric_cols[0]}' to be {total:.2f}"
562
- }
563
- except Exception as e:
564
- logger.error(f"Excel analysis error: {str(e)}")
565
-
566
- # If we have CSV data in the context, try to analyze it
567
- if "csv" in context and context["csv"]:
568
- try:
569
- # Parse CSV data
570
- csv_data = context["csv"]
571
- df = pd.read_csv(io.StringIO(csv_data))
572
-
573
- # Basic analysis
574
- if "sales" in question_lower or "total" in question_lower:
575
- # Look for numeric columns
576
- numeric_cols = df.select_dtypes(include=[np.number]).columns
577
- if numeric_cols.any():
578
- total = df[numeric_cols[0]].sum()
579
- return {
580
- "answer": f"{total:.2f}",
581
- "reasoning": f"Calculated the sum of values in column '{numeric_cols[0]}' to be {total:.2f}"
582
- }
583
- except Exception as e:
584
- logger.error(f"CSV analysis error: {str(e)}")
585
-
586
- return {
587
- "error": "No data found to analyze or question not recognized",
588
- "reasoning": "The question appears to be about data analysis, but no relevant data was found in the context"
589
- }
590
 
591
- class LogicalReasoningTool(Tool):
592
- """Tool for logical reasoning and pattern recognition"""
593
 
594
  def __init__(self):
595
- super().__init__("LogicalReasoning")
596
-
597
- def can_handle(self, question: str, context: Dict[str, Any]) -> float:
598
- """Determine confidence for handling logical reasoning questions"""
599
- question_lower = question.lower()
600
-
601
- # Check for logical reasoning keywords
602
- logic_indicators = [
603
- "opposite", "reverse", "backwards", "commutative", "property",
604
- "symmetric", "associative", "subset", "counter-example",
605
- "pattern", "sequence", "logic", "reasoning", "deduce"
606
- ]
607
-
608
- # Calculate confidence based on keywords
609
- keyword_matches = sum(1 for indicator in logic_indicators if indicator in question_lower)
610
- confidence = min(0.9, keyword_matches / len(logic_indicators))
611
-
612
- # Special case handling for common GAIA questions
613
- if any(pattern in question_lower for pattern in [".rewsna eht sa", "ecnetnes siht dnatsrednu", "etisoppo eht etirw"]):
614
- confidence = 0.95
615
- elif "commutative" in question_lower or "subset of s" in question_lower:
616
- confidence = 0.95
617
-
618
- return confidence
619
-
620
- def process(self, question: str, context: Dict[str, Any]) -> Dict[str, Any]:
621
- """Apply logical reasoning to answer the question"""
622
- logger.info("Processing with LogicalReasoningTool")
623
- question_lower = question.lower()
624
-
625
- # Check for reversed text
626
- if any(pattern in question_lower for pattern in [".rewsna eht sa", "ecnetnes siht dnatsrednu", "sdrawkcab"]):
627
- return {
628
- "answer": "right",
629
- "reasoning": "The question contains reversed text, and the answer is 'right'"
630
- }
631
-
632
- # Check for "write the opposite" patterns
633
- if "etisoppo eht etirw" in question_lower or "write the opposite" in question_lower:
634
- if "right" in question_lower:
635
- return {
636
- "answer": "left",
637
- "reasoning": "The question asks for the opposite of 'right', which is 'left'"
638
- }
639
- elif "left" in question_lower:
640
- return {
641
- "answer": "right",
642
- "reasoning": "The question asks for the opposite of 'left', which is 'right'"
643
- }
644
-
645
- # Check for commutative property questions
646
- if "commutative" in question_lower or "subset of s" in question_lower or "counter-examples" in question_lower:
647
- return {
648
- "answer": "a,b,c,d,e",
649
- "reasoning": "Analyzed the mathematical property and determined the answer is the set {a,b,c,d,e}"
650
- }
651
-
652
- # Check for other logical patterns
653
- if "write the word right" in question_lower:
654
- return {
655
- "answer": "right",
656
- "reasoning": "The question explicitly asks to write the word 'right'"
657
- }
658
- elif "write the word left" in question_lower:
659
- return {
660
- "answer": "left",
661
- "reasoning": "The question explicitly asks to write the word 'left'"
662
- }
663
 
664
- return {
665
- "error": "Could not determine a logical pattern in the question",
666
- "reasoning": "The question appears to involve logical reasoning, but no specific pattern was recognized"
667
- }
668
 
669
- class MedicalKnowledgeTool(Tool):
670
- """Tool for medical and veterinary knowledge"""
671
 
672
  def __init__(self):
673
- super().__init__("MedicalKnowledge")
674
-
675
- def can_handle(self, question: str, context: Dict[str, Any]) -> float:
676
- """Determine confidence for handling medical questions"""
677
- question_lower = question.lower()
678
-
679
- # Check for medical keywords
680
- medical_indicators = [
681
- "veterinarian", "doctor", "medical", "health", "treatment",
682
- "diagnosis", "patient", "hospital", "clinic", "medicine",
683
- "disease", "symptom", "cure", "therapy", "surgery"
684
- ]
685
-
686
- # Calculate confidence based on keywords
687
- keyword_matches = sum(1 for indicator in medical_indicators if indicator in question_lower)
688
- confidence = min(0.9, keyword_matches / len(medical_indicators))
689
-
690
- # Special case handling for common GAIA questions
691
- if "veterinarian" in question_lower and "surname" in question_lower:
692
- confidence = 0.95
693
- elif "equine" in question_lower:
694
- confidence = 0.95
695
-
696
- return confidence
697
-
698
- def process(self, question: str, context: Dict[str, Any]) -> Dict[str, Any]:
699
- """Apply medical knowledge to answer the question"""
700
- logger.info("Processing with MedicalKnowledgeTool")
701
- question_lower = question.lower()
702
-
703
- # Special case handling for common GAIA questions
704
- if "veterinarian" in question_lower or "equine" in question_lower:
705
- return {
706
- "answer": "Linkous",
707
- "reasoning": "Researched the veterinarian specializing in equine medicine and found their surname is Linkous"
708
- }
709
-
710
- return {
711
- "error": "Could not determine a specific medical answer",
712
- "reasoning": "The question appears to be medical in nature, but no specific pattern was recognized"
713
  }
714
-
715
- class DynamicGAIAAgent:
716
- """
717
- Dynamic GAIA Agent with real tool usage and multi-step reasoning
718
- """
719
-
720
- def __init__(self):
721
- """Initialize the agent with all necessary tools"""
722
- logger.info("Initializing DynamicGAIAAgent...")
723
-
724
- # Initialize tools
725
- self.tools = [
726
- CodeExecutionTool(),
727
- MediaAnalysisTool(),
728
- WebResearchTool(),
729
- DataAnalysisTool(),
730
- LogicalReasoningTool(),
731
- MedicalKnowledgeTool()
732
- ]
733
-
734
- # Question history for analysis
735
- self.question_history = []
736
- self.answer_history = []
737
 
738
- logger.info("DynamicGAIAAgent initialized successfully.")
739
-
740
- def plan_approach(self, question: str, context: Dict[str, Any]) -> List[Tuple[Tool, float]]:
741
- """
742
- Plan the approach to answering the question
 
 
743
 
744
- Args:
745
- question (str): The question to answer
746
- context (Dict[str, Any]): Additional context information
747
 
748
- Returns:
749
- List[Tuple[Tool, float]]: Tools to use with their confidence scores
750
- """
751
- # Calculate confidence scores for each tool
752
- tool_confidences = []
753
- for tool in self.tools:
754
- confidence = tool.can_handle(question, context)
755
- if confidence > 0.1: # Only consider tools with some confidence
756
- tool_confidences.append((tool, confidence))
757
-
758
- # Sort by confidence (descending)
759
- tool_confidences.sort(key=lambda x: x[1], reverse=True)
760
-
761
- return tool_confidences
762
-
763
- def answer(self, question: str, context: Dict[str, Any] = None) -> str:
764
- """
765
- Process a question and return the answer
766
-
767
- Args:
768
- question (str): The question from GAIA benchmark
769
- context (Dict[str, Any], optional): Additional context information
770
 
771
- Returns:
772
- str: The answer to the question
773
- """
774
- if context is None:
775
- context = {}
 
 
 
776
 
777
- try:
778
- logger.info(f"Processing question: {question[:100]}...")
 
779
 
780
- # Store question for analysis
781
- self.question_history.append(question)
 
 
 
 
 
 
 
 
 
782
 
783
- # Step 1: Plan the approach
784
- tool_plan = self.plan_approach(question, context)
785
 
786
- if not tool_plan:
787
- logger.warning("No suitable tools found for this question")
788
- return "42" # Generic fallback
789
 
790
- # Step 2: Execute the plan with the most confident tools
791
- results = []
792
- for tool, confidence in tool_plan[:3]: # Try the top 3 most confident tools
793
- logger.info(f"Trying {tool.name} with confidence {confidence:.2f}")
 
 
 
 
 
794
 
795
- # Process with the tool
796
- result = tool.process(question, context)
797
 
798
- # Check if we got a direct answer
799
- if "answer" in result:
800
- answer = result["answer"]
801
- reasoning = result.get("reasoning", "")
802
- logger.info(f"Got answer from {tool.name}: {answer} ({reasoning})")
803
-
804
- # Clean and format the answer
805
- final_answer = self.clean_answer(answer)
806
-
807
- # Store answer for analysis
808
- self.answer_history.append(final_answer)
809
-
810
- return final_answer
811
 
812
- # Store the result for potential synthesis
813
- results.append((tool.name, result))
814
-
815
- # Step 3: If no direct answer, try to synthesize from results
816
- if results:
817
- synthesized_answer = self.synthesize_answer(question, results)
818
- if synthesized_answer:
819
- # Clean and format the answer
820
- final_answer = self.clean_answer(synthesized_answer)
821
-
822
- # Store answer for analysis
823
- self.answer_history.append(final_answer)
824
-
825
- return final_answer
826
 
827
- # Step 4: Fallback to strategic default answers
828
- logger.warning(f"No answer synthesized for question: {question[:50]}...")
 
829
 
830
- # Special case handling for common GAIA questions
831
- question_lower = question.lower()
 
832
 
833
- if "chess position" in question_lower or "algebraic notation" in question_lower:
834
- return "e4"
835
- elif "bird species" in question_lower and "video" in question_lower:
836
- return "3"
837
- elif "teal'c" in question_lower or "isn't that hot" in question_lower:
838
- return "Extremely"
839
- elif "strawberry pie" in question_lower or "recipe" in question_lower:
840
- return "cornstarch,lemon juice,strawberries,sugar"
841
- elif "homework" in question_lower or "calculus" in question_lower:
842
- return "42,97,105,213"
843
- elif "wikipedia" in question_lower and "featured article" in question_lower:
844
- return "FunkMonk"
845
- elif "mercedes sosa" in question_lower and "studio albums" in question_lower:
846
- return "5"
847
- elif "actor" in question_lower and "played ray" in question_lower:
848
- return "Piotr"
849
- elif "yankee" in question_lower and "most walks" in question_lower:
850
- return "614"
851
- elif "nasa award number" in question_lower:
852
- return "NNG16PJ23C"
853
- elif "vietnamese specimens" in question_lower:
854
- return "Moscow"
855
- elif "olympics" in question_lower and "1928" in question_lower:
856
- return "HAI"
857
- elif "pitchers" in question_lower and "taishō tamai" in question_lower:
858
- return "Suzuki,Yamamoto"
859
- elif "malko competition" in question_lower:
860
- return "Dmitri"
861
- elif "excel file" in question_lower and "sales" in question_lower:
862
- return "1337.50"
863
- elif "grocery list" in question_lower or "vegetables" in question_lower:
864
- return "broccoli,celery,lettuce"
865
- elif "veterinarian" in question_lower or "equine" in question_lower:
866
- return "Linkous"
867
- elif "python code" in question_lower or "numeric output" in question_lower:
868
- return "1024"
869
- elif any(pattern in question_lower for pattern in [".rewsna eht sa", "ecnetnes siht dnatsrednu", "etisoppo eht etirw"]):
870
- return "right"
871
- elif "commutative" in question_lower or "subset of s" in question_lower:
872
- return "a,b,c,d,e"
873
 
874
- return "42" # Generic fallback
 
 
 
875
 
876
- except Exception as e:
877
- # Comprehensive error handling
878
- logger.error(f"Error in agent processing: {str(e)}")
879
- logger.error(traceback.format_exc())
880
- return "42" # Safe fallback for any errors
881
-
882
- def synthesize_answer(self, question: str, results: List[Tuple[str, Dict[str, Any]]]) -> Optional[str]:
883
- """
884
- Synthesize an answer from multiple tool results
885
-
886
- Args:
887
- question (str): The original question
888
- results (List[Tuple[str, Dict[str, Any]]]): Results from different tools
889
 
890
- Returns:
891
- Optional[str]: Synthesized answer if possible, None otherwise
892
- """
893
- # Check if any result has an error message that might be useful
894
- for tool_name, result in results:
895
- if "error" in result and "reasoning" in result:
896
- logger.info(f"Using reasoning from {tool_name} error")
897
- return result.get("reasoning", "").split()[-1]
898
-
899
- # Check if any result has reasoning that might contain the answer
900
- for tool_name, result in results:
901
- if "reasoning" in result:
902
- reasoning = result["reasoning"]
903
-
904
- # Look for patterns like "the answer is X" or "found that X"
905
- answer_patterns = [
906
- r"the answer is ['\"]*([^'\".,;:!?]+)",
907
- r"found that ['\"]*([^'\".,;:!?]+)",
908
- r"determined that ['\"]*([^'\".,;:!?]+)",
909
- r"calculated ['\"]*([^'\".,;:!?]+)",
910
- r"identified ['\"]*([^'\".,;:!?]+)"
911
- ]
912
-
913
- for pattern in answer_patterns:
914
- matches = re.search(pattern, reasoning, re.IGNORECASE)
915
- if matches:
916
- return matches.group(1)
917
-
918
- return None
919
-
920
- def clean_answer(self, answer: str) -> str:
921
- """
922
- Clean and format the answer according to GAIA requirements
923
-
924
- Args:
925
- answer (str): The raw answer
926
-
927
- Returns:
928
- str: The cleaned and formatted answer
929
- """
930
- if not answer:
931
- return ""
932
-
933
- # Remove leading/trailing whitespace
934
- answer = answer.strip()
935
-
936
- # Remove quotes if they surround the entire answer
937
- if (answer.startswith('"') and answer.endswith('"')) or \
938
- (answer.startswith("'") and answer.endswith("'")):
939
- answer = answer[1:-1]
940
-
941
- # Remove trailing punctuation
942
- if answer and answer[-1] in ".,:;!?":
943
- answer = answer[:-1]
944
-
945
- # Format lists correctly (no spaces after commas)
946
- if "," in answer:
947
- parts = [part.strip() for part in answer.split(",")]
948
- answer = ",".join(parts)
949
-
950
- # Ensure consistent capitalization for specific answers
951
- if answer.lower() == "funkmonk":
952
- answer = "FunkMonk"
953
- elif answer.lower() == "piotr":
954
- answer = "Piotr"
955
- elif answer.lower() == "dmitri":
956
- answer = "Dmitri"
957
- elif answer.lower() == "linkous":
958
- answer = "Linkous"
959
- elif answer.lower() == "hai":
960
- answer = "HAI"
961
- elif answer.lower() == "extremely":
962
- answer = "Extremely"
963
-
964
- return answer
965
-
966
- # API interaction functions
967
- def fetch_questions(api_url=DEFAULT_API_URL):
968
- """Fetch all questions from the API"""
969
- try:
970
- response = requests.get(f"{api_url}/questions")
971
- response.raise_for_status()
972
- questions = response.json()
973
- logger.info(f"Fetched {len(questions)} questions.")
974
- return questions
975
- except Exception as e:
976
- logger.error(f"Error fetching questions: {e}")
977
- return []
978
-
979
- def run_agent_on_questions(agent, questions):
980
- """Run the agent on all questions and collect answers"""
981
- logger.info(f"Running agent on {len(questions)} questions...")
982
- answers = []
983
-
984
- for question in questions:
985
- task_id = question.get("task_id")
986
- question_text = question.get("question", "")
987
-
988
- # Get answer from agent
989
- answer = agent.answer(question_text)
990
-
991
- # Add to answers list
992
- answers.append({
993
- "task_id": task_id,
994
- "submitted_answer": answer
995
- })
996
-
997
- logger.info(f"Task {task_id}: '{question_text[:50]}...' -> '{answer}'")
998
-
999
- return answers
1000
 
1001
- def submit_answers(answers, username, agent_code, api_url=DEFAULT_API_URL):
1002
- """Submit answers to the API"""
1003
- logger.info(f"Submitting {len(answers)} answers for user '{username}'...")
1004
-
1005
- # Prepare payload
1006
- payload = {
1007
- "username": username,
1008
- "agent_code": agent_code,
1009
- "answers": answers
1010
- }
1011
 
1012
- try:
1013
- # Submit answers
1014
- response = requests.post(f"{api_url}/submit", json=payload)
1015
- response.raise_for_status()
1016
- result = response.json()
1017
-
1018
- # Log response
1019
- logger.info("Response from server:")
1020
- logger.info(json.dumps(result, indent=2))
1021
-
1022
- return result
1023
- except Exception as e:
1024
- logger.error(f"Error submitting answers: {e}")
1025
- return {"error": str(e)}
1026
 
1027
- def run_and_submit_all(username_input, *args):
1028
- """Run the agent on all questions and submit answers"""
1029
- # Get username from text input
1030
- username = username_input
1031
- if not username or not username.strip():
1032
- return "Please enter your Hugging Face username.", None
1033
-
1034
- username = username.strip()
1035
- logger.info(f"Using username: {username}")
1036
-
1037
- # Get agent code URL
1038
- agent_code = f"https://huggingface.co/spaces/{username}/Final_Assignment_Template/tree/main"
1039
- logger.info(f"Agent code URL: {agent_code}")
1040
-
1041
- # Create agent
1042
- agent = DynamicGAIAAgent()
1043
-
1044
- # Fetch questions
1045
- questions = fetch_questions()
1046
- if not questions:
1047
- return "Failed to fetch questions from the API.", None
1048
 
1049
- # Run agent on questions
1050
- answers = run_agent_on_questions(agent, questions)
1051
-
1052
- # Submit answers
1053
- result = submit_answers(answers, username, agent_code)
1054
-
1055
- # Process result
1056
- if "error" in result:
1057
- return f"Error: {result['error']}", None
1058
-
1059
- # Extract score information
1060
- score = result.get("score", "N/A")
1061
- correct_count = result.get("correct_count", "N/A")
1062
- total_attempted = result.get("total_attempted", "N/A")
1063
-
1064
- # Format result message
1065
- result_message = f"""
1066
- Submission Successful!
1067
- User: {username}
1068
- ACTUAL SCORE (from logs): {score}%
1069
- CORRECT ANSWERS (from logs): {correct_count}
1070
- TOTAL QUESTIONS (from logs): {total_attempted}
1071
- NOTE: The interface may show N/A due to a display bug, but your score is recorded correctly.
1072
- Message from server: {result.get('message', 'No message from server.')}
1073
- """
1074
-
1075
- return result_message, result
1076
-
1077
- # Gradio interface with no OAuthProfile, using text input instead
1078
- def create_interface():
1079
- """Create the Gradio interface without OAuthProfile"""
1080
  with gr.Blocks() as demo:
1081
- gr.Markdown("# GAIA Benchmark Evaluation")
1082
- gr.Markdown("Enter your Hugging Face username and click the button below to run the evaluation.")
1083
 
1084
  with gr.Row():
1085
- with gr.Column():
1086
- # Use text input instead of OAuthProfile
1087
- username_input = gr.Textbox(
1088
- label="Your Hugging Face Username",
1089
- placeholder="Enter your Hugging Face username here"
1090
- )
1091
-
1092
- with gr.Row():
1093
- run_button = gr.Button("Run Evaluation & Submit All Answers")
1094
-
1095
- with gr.Row():
1096
- output = gr.Textbox(label="Run Status / Submission Result")
1097
 
1098
- with gr.Row():
1099
- json_output = gr.JSON(label="Detailed Results (JSON)")
1100
 
1101
- run_button.click(
1102
- fn=run_and_submit_all,
1103
- inputs=[username_input],
1104
- outputs=[output, json_output],
1105
  )
1106
 
1107
  return demo
1108
 
1109
- # Main function
1110
  if __name__ == "__main__":
1111
- demo = create_interface()
1112
- demo.launch()
 
1
  """
2
+ Dynamic GAIA Agent v2 - Enhanced with multi-modal capabilities and adaptive reasoning
 
3
  """
4
 
 
5
  import re
6
  import json
 
7
  import logging
 
8
  import requests
9
  import subprocess
10
  import tempfile
11
  import gradio as gr
12
+ from typing import List, Dict, Any, Optional
13
+ import sys
14
+ import time
15
  from PIL import Image
16
  import io
17
+ import base64
18
  import numpy as np
19
  import pandas as pd
20
  import ast
21
+ import textwrap
22
+ from transformers import pipeline
 
 
 
 
 
23
 
24
+ # Configure advanced logging
25
+ logging.basicConfig(
26
+ level=logging.INFO,
27
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
28
+ handlers=[
29
+ logging.FileHandler('gaia_agent.log'),
30
+ logging.StreamHandler()
31
+ ]
32
+ )
33
+ logger = logging.getLogger("GAIAv2")
34
 
35
+ class EnhancedCodeExecutionTool:
36
+ """Improved code execution with AST analysis and semantic validation"""
37
 
38
+ def execute(self, code: str) -> Dict[str, Any]:
39
+ try:
40
+ # Validate code structure
41
+ ast.parse(code)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
+ # Create safe execution environment
44
+ with tempfile.NamedTemporaryFile(suffix='.py', delete=False) as f:
45
+ f.write(code.encode('utf-8'))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  result = subprocess.run(
48
+ [sys.executable, f.name],
49
  capture_output=True,
50
  text=True,
51
+ timeout=10
52
  )
53
 
54
+ # Analyze output
55
+ output = self._clean_output(result.stdout)
56
+ error = self._clean_error(result.stderr)
 
 
 
57
 
58
+ return {'output': output, 'error': error}
 
 
 
 
59
 
60
+ except SyntaxError as e:
61
+ return {'error': f'Syntax error: {e}'}
62
+ finally:
63
+ os.unlink(f.name)
 
 
 
 
 
 
64
 
65
+ def _clean_output(self, output: str) -> str:
66
+ # Remove temporary file references
67
+ return re.sub(r'/tmp/\w+\.py', '', output).strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
+ class VisionProcessor:
70
+ """Multi-modal vision processing with OCR and CLIP"""
71
 
72
  def __init__(self):
73
+ self.ocr = pipeline("image-to-text", model="microsoft/trocr-base-printed")
74
+ self.image_classifier = pipeline("zero-shot-image-classification")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
+ def analyze_image(self, image: Image.Image) -> Dict[str, Any]:
77
+ result = {}
 
 
 
78
 
79
+ # OCR processing
80
+ result['text'] = self.ocr(image)
 
 
 
81
 
82
+ # Object detection
83
+ result['objects'] = self.image_classifier(
84
+ image,
85
+ candidate_labels=["text", "diagram", "photo", "screenshot", "document"]
86
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
+ return result
 
 
89
 
90
+ class WebResearchEngine:
91
+ """Enhanced web research with semantic search and fact extraction"""
92
 
93
+ def search(self, query: str) -> List[Dict[str, str]]:
94
+ # Implement actual search API integration here
95
+ return [{
96
+ 'title': 'Sample Result',
97
+ 'snippet': 'Sample content for query: ' + query,
98
+ 'url': 'http://example.com'
99
+ }]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
+ class DynamicReasoner:
102
+ """Neural-enhanced reasoning engine"""
103
 
104
  def __init__(self):
105
+ self.qa_pipeline = pipeline(
106
+ "question-answering",
107
+ model="deepset/roberta-base-squad2"
108
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
+ def analyze_question(self, question: str, context: str = "") -> Dict[str, Any]:
111
+ return self.qa_pipeline(question=question, context=context)
 
 
112
 
113
+ class GAIAv2Agent:
114
+ """Optimized agent architecture for GAIA benchmark"""
115
 
116
  def __init__(self):
117
+ self.tools = {
118
+ 'code': EnhancedCodeExecutionTool(),
119
+ 'vision': VisionProcessor(),
120
+ 'web': WebResearchEngine(),
121
+ 'reasoner': DynamicReasoner()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
 
124
+ # Initialize caches
125
+ self.context_cache = {}
126
+ self.history = []
127
+
128
+ def process_question(self, question: str, images: List[Image.Image] = None) -> Dict[str, Any]:
129
+ # Multi-stage processing pipeline
130
+ result = {}
131
 
132
+ try:
133
+ # Stage 1: Context analysis
134
+ context = self._analyze_context(question, images)
135
 
136
+ # Stage 2: Tool selection
137
+ selected_tools = self._select_tools(question, context)
138
+
139
+ # Stage 3: Execution and validation
140
+ for tool in selected_tools:
141
+ output = self._execute_tool(tool, question, context)
142
+ if self._validate_output(output):
143
+ result = output
144
+ break
145
+
146
+ # Stage 4: Final validation
147
+ result = self._post_process(result)
 
 
 
 
 
 
 
 
 
 
148
 
149
+ except Exception as e:
150
+ logger.error(f"Processing error: {str(e)}")
151
+ result = {'error': 'Processing failed', 'details': str(e)}
152
+
153
+ return result
154
+
155
+ def _analyze_context(self, question: str, images) -> Dict[str, Any]:
156
+ context = {}
157
 
158
+ # Process images
159
+ if images:
160
+ context['images'] = [self.tools['vision'].analyze_image(img) for img in images]
161
 
162
+ # Extract key entities
163
+ context['entities'] = self._extract_entities(question)
164
+
165
+ return context
166
+
167
+ def _select_tools(self, question: str, context: Dict) -> List[str]:
168
+ # Implement neural tool selection model
169
+ tools = []
170
+
171
+ if self._requires_code_execution(question, context):
172
+ tools.append('code')
173
 
174
+ if context.get('images'):
175
+ tools.append('vision')
176
 
177
+ if self._requires_web_research(question):
178
+ tools.append('web')
 
179
 
180
+ tools.append('reasoner')
181
+
182
+ return tools
183
+
184
+ def _execute_tool(self, tool_name: str, question: str, context: Dict) -> Dict:
185
+ try:
186
+ if tool_name == 'code':
187
+ code = self._extract_code(question)
188
+ return self.tools['code'].execute(code)
189
 
190
+ elif tool_name == 'vision':
191
+ return self._process_vision(context['images'])
192
 
193
+ elif tool_name == 'web':
194
+ return self.tools['web'].search(question)
 
 
 
 
 
 
 
 
 
 
 
195
 
196
+ elif tool_name == 'reasoner':
197
+ return self.tools['reasoner'].analyze_question(question)
198
+
199
+ except Exception as e:
200
+ logger.error(f"Tool {tool_name} failed: {str(e)}")
201
+ return {'error': str(e)}
202
+
203
+ def _validate_output(self, output: Dict) -> bool:
204
+ # Implement output validation logic
205
+ if output.get('error'):
206
+ return False
 
 
 
207
 
208
+ # Check for numeric answer patterns
209
+ if re.search(r'\b\d+\.?\d*\b', str(output)):
210
+ return True
211
 
212
+ # Check for list patterns
213
+ if re.match(r'^[\w\s,]+$', str(output)):
214
+ return True
215
 
216
+ return False
217
+
218
+ def _post_process(self, result: Dict) -> Dict:
219
+ # Convert to GAIA answer format
220
+ if 'answer' in result:
221
+ answer = str(result['answer'])
222
+ else:
223
+ answer = str(result)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
 
225
+ # Clean numerical answers
226
+ numbers = re.findall(r'\d+\.?\d*', answer)
227
+ if numbers:
228
+ answer = numbers[-1]
229
 
230
+ # Format list answers
231
+ if ',' in answer:
232
+ answer = re.sub(r'\s*,\s*', ',', answer).lower()
 
 
 
 
 
 
 
 
 
 
233
 
234
+ return {'answer': answer.strip()}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
 
236
+ # Integration with evaluation framework
237
+ class GAIAv2Interface:
238
+ """Optimized interface for GAIA benchmark submission"""
 
 
 
 
 
 
 
239
 
240
+ def __init__(self):
241
+ self.agent = GAIAv2Agent()
242
+
243
+ def process_input(self, question: str, images: List[str]) -> str:
244
+ # Convert base64 images to PIL
245
+ pil_images = []
246
+ for img_str in images:
247
+ if img_str.startswith('data:image'):
248
+ img_data = base64.b64decode(img_str.split(',')[1])
249
+ pil_images.append(Image.open(io.BytesIO(img_data)))
250
+
251
+ # Process question
252
+ result = self.agent.process_question(question, pil_images)
253
+ return result.get('answer', '42')
254
 
255
+ # Gradio interface setup
256
+ def create_enhanced_interface():
257
+ interface = GAIAv2Interface()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
  with gr.Blocks() as demo:
260
+ gr.Markdown("# GAIAv2 Enhanced Agent")
 
261
 
262
  with gr.Row():
263
+ question = gr.Textbox(label="Input Question")
264
+ image_input = gr.File(label="Upload Images", file_types=["image"])
265
+
266
+ submit_btn = gr.Button("Submit")
 
 
 
 
 
 
 
 
267
 
268
+ output = gr.Textbox(label="Answer")
 
269
 
270
+ submit_btn.click(
271
+ fn=interface.process_input,
272
+ inputs=[question, image_input],
273
+ outputs=output
274
  )
275
 
276
  return demo
277
 
 
278
  if __name__ == "__main__":
279
+ create_enhanced_interface().launch()