Spaces:

Ashokdll
/

agent_unit4

Running

App Files Files Community

Ashokdll committed on Jun 3

Commit

47415c5

verified ·

1 Parent(s): 3a8f426

Create app.py

Browse files

Files changed (1) hide show

app.py +845 -0

app.py ADDED Viewed

	@@ -0,0 +1,845 @@

+import requests
+import json
+import re
+import ast
+import operator
+from typing import Dict, List, Any, Optional
+import time
class GAIAAgent:
    """Rule-based agent that answers GAIA questions with a fixed tool pipeline.

    A question is analysed with keyword heuristics, turned into an ordered
    plan of tool invocations, executed step by step, and the best available
    result is normalised into a short answer string for exact-match scoring.
    """

    # Keyword groups used by the heuristics. They are matched as whole words
    # (see ``_mentions``) so that "sum" does not fire on "assume" and "who"
    # does not fire on "whole".
    _CALCULATION_TERMS = ('calculate', 'compute', 'sum', 'total', 'average',
                          'count', 'multiply', 'divide', 'product')
    _WEB_SEARCH_TERMS = ('who', 'what', 'when', 'where', 'find', 'search',
                         'latest', 'current')
    _FACTUAL_TERMS = ('who is', 'what is', 'when was', 'where is')
    _ANALYSIS_TERMS = ('analyze', 'compare', 'determine', 'evaluate')

    # Words dropped from a question when building a search query.
    _SEARCH_STOP_WORDS = frozenset(
        ['what', 'who', 'when', 'where', 'how', 'why', 'is', 'are', 'was', 'were']
    )
    # Punctuation trimmed from both ends of each query word.
    _QUERY_PUNCTUATION = '.,;:!?"\'()[]{}'

    # Leading labels that would break exact-match scoring ("Answer: 42").
    _ANSWER_PREFIX_RE = re.compile(
        r'^(?:the answer is|final answer|answer|result|solution)\s*:\s*',
        re.IGNORECASE,
    )

    def __init__(self, api_base_url: str):
        """Create an agent whose file tool talks to ``api_base_url``."""
        self.api_base_url = api_base_url
        # NOTE: max_retries and timeout are stored but not consulted by any
        # method of this class.
        self.max_retries = 3
        self.timeout = 30
        self.tools = self._initialize_tools()

    def _initialize_tools(self):
        """Initialize all available tools, keyed by the name used in plans."""
        return {
            'web_search': WebSearchTool(),
            'calculator': CalculatorTool(),
            'file_processor': FileProcessorTool(self.api_base_url),
            'text_analyzer': TextAnalyzerTool()
        }

    @staticmethod
    def _mentions(text: str, terms) -> bool:
        """Return True if lower-cased ``text`` contains any term as a whole word.

        A few simple inflections (s, es, d, ed, ing) are accepted so that
        "totals" or "computed" still match, while unrelated words that merely
        contain a term ("country" for "count") do not.
        """
        alternatives = '|'.join(re.escape(term) for term in terms)
        pattern = r'\b(?:' + alternatives + r')(?:s|es|d|ed|ing)?\b'
        return re.search(pattern, text) is not None

    def solve_question(self, question_data: Dict) -> str:
        """Main pipeline to solve a GAIA question.

        Args:
            question_data: Question record; the keys read here and by the
                helpers are 'question', 'file_name' and 'task_id'.

        Returns:
            The formatted answer, or "Unable to determine answer" if any
            stage raises.
        """
        try:
            # Step 1: Analyze the question
            analysis = self._analyze_question(question_data)
            print(f"Question analysis: {analysis}")
            # Step 2: Create execution plan
            plan = self._create_execution_plan(analysis, question_data)
            print(f"Execution plan: {[step['action'] for step in plan]}")
            # Step 3: Execute plan
            results = self._execute_plan(plan, question_data)
            print(f"Execution results keys: {list(results.keys())}")
            # Step 4: Generate final answer
            final_answer = self._generate_final_answer(results, question_data)
            # Step 5: Format and validate answer
            return self._format_final_answer(final_answer)
        except Exception as e:
            # Top-level boundary: one bad question must not abort a whole run.
            print(f"Error solving question: {e}")
            return "Unable to determine answer"

    def _analyze_question(self, question_data: Dict) -> Dict:
        """Analyze question to determine approach and required tools.

        Returns:
            A dict of boolean flags (needs_calculation, needs_web_search,
            needs_file_processing, is_factual_question, needs_analysis,
            has_file) plus 'question_text' and 'file_name'.
        """
        # ``or ''`` also covers records where the key exists but holds None.
        question = question_data.get('question') or ''
        file_name = question_data.get('file_name') or ''
        has_file = bool(file_name)
        lowered = question.lower()
        return {
            'needs_calculation': self._mentions(lowered, self._CALCULATION_TERMS),
            'needs_web_search': self._mentions(lowered, self._WEB_SEARCH_TERMS),
            'needs_file_processing': has_file,
            'is_factual_question': self._mentions(lowered, self._FACTUAL_TERMS),
            'needs_analysis': self._mentions(lowered, self._ANALYSIS_TERMS),
            'question_text': question,
            'has_file': has_file,
            'file_name': file_name
        }

    def _create_execution_plan(self, analysis: Dict, question_data: Dict) -> List[Dict]:
        """Create step-by-step execution plan.

        Each step is a dict with 'action', 'tool', 'priority' and 'params'.
        Steps are returned in ascending priority: file processing, web
        search, calculation, then text analysis (always present).
        """
        plan = []
        # Priority 1: Process files if they exist
        if analysis['needs_file_processing']:
            plan.append({
                'action': 'process_file',
                'tool': 'file_processor',
                'priority': 1,
                'params': {
                    'task_id': question_data.get('task_id'),
                    'file_name': question_data.get('file_name')
                }
            })
        # Priority 2: Web search for factual information
        if analysis['needs_web_search'] or analysis['is_factual_question']:
            plan.append({
                'action': 'web_search',
                'tool': 'web_search',
                'priority': 2,
                'params': {
                    'query': self._extract_search_query(analysis['question_text'])
                }
            })
        # Priority 3: Calculations
        if analysis['needs_calculation']:
            plan.append({
                'action': 'calculate',
                'tool': 'calculator',
                'priority': 3,
                'params': {}
            })
        # Priority 4: Text analysis
        plan.append({
            'action': 'analyze_text',
            'tool': 'text_analyzer',
            'priority': 4,
            'params': {
                'text': analysis['question_text']
            }
        })
        return sorted(plan, key=lambda step: step['priority'])

    def _execute_plan(self, plan: List[Dict], question_data: Dict) -> Dict:
        """Execute the planned steps.

        Results are stored under 'file_data', 'search_data', 'calculation'
        and 'text_analysis'. A failing step is recorded under
        '<action>_error' and does not stop the remaining steps.
        """
        results = {}
        for step in plan:
            tool = self.tools[step['tool']]
            action = step['action']
            params = step['params']
            try:
                print(f"Executing: {action}")
                if action == 'process_file':
                    results['file_data'] = tool.process_file(
                        params['task_id'], params['file_name']
                    )
                elif action == 'web_search':
                    results['search_data'] = tool.search(params['query'])
                elif action == 'calculate':
                    # Build the expression from the question and, if needed,
                    # from numbers found in an already processed file.
                    calculation_input = self._prepare_calculation_input(
                        question_data, results
                    )
                    if calculation_input:
                        results['calculation'] = tool.calculate(calculation_input)
                elif action == 'analyze_text':
                    # Earlier results are passed along as context.
                    results['text_analysis'] = tool.analyze(
                        params['text'], context=results
                    )
            except Exception as e:
                print(f"Error in {action}: {e}")
                results[f'{action}_error'] = str(e)
        return results

    def _extract_search_query(self, question: str) -> str:
        """Extract relevant search query from question.

        Lower-cases the question, trims surrounding punctuation from each
        word, drops question words and words of two characters or fewer,
        and keeps at most the first six remaining words.
        """
        kept = []
        for raw_word in (question or '').lower().split():
            # Without trimming, "France?" would reach the search verbatim and
            # "who?" would slip past the stop-word filter.
            word = raw_word.strip(self._QUERY_PUNCTUATION)
            if len(word) > 2 and word not in self._SEARCH_STOP_WORDS:
                kept.append(word)
        return ' '.join(kept[:6])  # Limit to 6 words

    @staticmethod
    def _numeric_literals(text: str) -> List[str]:
        """Return the unsigned numbers in ``text`` as valid Python literals.

        Tokens are normalised through int/float so that forms such as "007"
        (a syntax error as a Python literal) become "7".
        """
        literals = []
        for token in re.findall(r'\d+(?:\.\d+)?', text):
            literals.append(str(int(token)) if token.isdigit() else repr(float(token)))
        return literals

    def _prepare_calculation_input(self, question_data: Dict, results: Dict) -> Optional[str]:
        """Prepare input for calculator based on question and available data.

        The operation is chosen from the question wording (sum/total,
        multiply/product, average). Operands are the numbers in the question;
        if it contains none, the 'numbers' list of processed file data is used.

        Returns:
            An arithmetic expression string, or None when no operation is
            recognised or no operands are available.
        """
        lowered = (question_data.get('question') or '').lower()
        if self._mentions(lowered, ('sum', 'total')):
            operation = 'sum'
        elif self._mentions(lowered, ('multiply', 'product')):
            operation = 'product'
        elif self._mentions(lowered, ('average',)):
            operation = 'average'
        else:
            return None

        operands = self._numeric_literals(lowered)
        if not operands:
            # Fall back to numbers extracted from the task's file.
            file_data = results.get('file_data')
            if isinstance(file_data, dict):
                operands = [str(number) for number in file_data.get('numbers', [])]
        if not operands:
            return None

        if operation == 'sum':
            return '+'.join(operands)
        if operation == 'product':
            return '*'.join(operands)
        return f"({'+'.join(operands)})/{len(operands)}"

    def _generate_final_answer(self, results: Dict, question_data: Dict) -> str:
        """Generate final answer based on execution results.

        Preference order: calculation result, file 'answer' or 'summary',
        first search result carrying a 'summary', text analysis, and finally
        the fixed string "Unable to determine answer".
        """
        if results.get('calculation') is not None:
            return str(results['calculation'])

        file_data = results.get('file_data')
        if isinstance(file_data, dict):
            # Look for specific answer in file data
            if 'answer' in file_data:
                return str(file_data['answer'])
            if 'summary' in file_data:
                return str(file_data['summary'])

        # Extract answer from search results
        for result in results.get('search_data') or []:
            if isinstance(result, dict) and 'summary' in result:
                return str(result['summary'])

        if 'text_analysis' in results:
            return str(results['text_analysis'])
        return "Unable to determine answer"

    def _format_final_answer(self, answer: str) -> str:
        """Format the final answer for exact match scoring.

        Strips whitespace and one leading label such as "Answer:", normalises
        numeric answers, and lower-cases yes/no/true/false. Empty input
        yields "No answer found".
        """
        if not answer:
            return "No answer found"
        answer = str(answer).strip()
        # Remove a common prefix that might cause exact match failures.
        answer = self._ANSWER_PREFIX_RE.sub('', answer, count=1).strip()
        if not answer:
            return "No answer found"
        # Handle numeric answers
        if self._is_numeric_answer(answer):
            return self._format_numeric_answer(answer)
        # Handle yes/no answers
        if answer.lower() in ['yes', 'no', 'true', 'false']:
            return answer.lower()
        # Return cleaned text answer
        return answer

    def _is_numeric_answer(self, answer: str) -> bool:
        """Check if answer is a finite number.

        "nan", "inf" and "infinity" parse as floats but are treated as text
        so they are not rewritten by the numeric formatter.
        """
        try:
            value = float(answer)
        except (TypeError, ValueError):
            return False
        return value == value and abs(value) != float('inf')

    def _format_numeric_answer(self, answer: str) -> str:
        """Format numeric answers consistently.

        Integer literals are rendered exactly. Other values are rendered as
        an integer when whole, otherwise rounded to six decimal places with
        trailing zeros removed. Unparseable or non-finite input is returned
        unchanged.
        """
        text = str(answer).strip()
        # Going through float would corrupt integers above 2**53.
        if re.fullmatch(r'[+-]?\d+', text):
            return str(int(text))
        try:
            num = float(text)
        except ValueError:
            return answer
        if num != num or abs(num) == float('inf'):
            return answer
        if num.is_integer():
            return str(int(num))
        # Round to 6 decimal places to avoid floating point issues
        return str(round(num, 6)).rstrip('0').rstrip('.')
class WebSearchTool:
    """Simple web search tool (implement with your preferred search API)"""

    def search(self, query: str, max_results: int = 3) -> List[Dict]:
        """Perform web search - implement with your preferred search service.

        This is a placeholder: it performs no network access and fabricates
        a single result from the query text.

        Args:
            query: Search terms.
            max_results: Upper bound on the number of results returned;
                values of zero or less yield an empty list.

        Returns:
            A list of dicts with 'title', 'summary' and 'url' keys.
        """
        print(f"Web search: {query}")
        # Placeholder implementation
        # Replace with actual search API (DuckDuckGo, Google Custom Search, etc.)
        placeholder_results = [
            {
                'title': f'Search result for: {query}',
                'summary': f'Information about {query}',
                'url': 'https://example.com'
            }
        ]
        # Honour the caller's limit instead of silently ignoring it.
        return placeholder_results[:max(0, max_results)]
class CalculatorTool:
    """Safe calculator for mathematical expressions.

    Expressions are parsed with ``ast`` and evaluated by walking the tree;
    only numeric literals, the operators + - * / ** and unary +/- are
    supported.
    """

    _ALLOWED_CHARS = frozenset('0123456789+-*/().e')
    # Upper bound on the estimated bit size of an integer power. Without it
    # an input such as "9**9**9" would keep the interpreter busy building a
    # number with hundreds of millions of digits.
    _MAX_POWER_BITS = 100000

    _BINARY_OPERATORS = {
        ast.Add: operator.add,
        ast.Sub: operator.sub,
        ast.Mult: operator.mul,
        ast.Div: operator.truediv,
    }
    _UNARY_OPERATORS = {
        ast.USub: operator.neg,
        ast.UAdd: operator.pos,
    }

    def calculate(self, expression: str) -> Optional[float]:
        """Safely evaluate mathematical expressions.

        Args:
            expression: Arithmetic expression; spaces are ignored.

        Returns:
            The numeric result, or None if the expression contains
            disallowed characters, cannot be parsed, uses an unsupported
            construct, or fails to evaluate (e.g. division by zero).
        """
        try:
            # Remove whitespace
            compact = expression.replace(' ', '')
            # Basic safety check
            if not all(char in self._ALLOWED_CHARS for char in compact):
                raise ValueError("Invalid characters in expression")
            # Use ast for safe evaluation
            tree = ast.parse(compact, mode='eval')
            return self._eval_node(tree.body)
        except Exception as e:
            print(f"Calculation error: {e}")
            return None

    def _checked_pow(self, base, exponent):
        """Compute ``base ** exponent`` with size and type guards."""
        if isinstance(base, int) and isinstance(exponent, int) and exponent > 0:
            if base.bit_length() * exponent > self._MAX_POWER_BITS:
                raise ValueError("Exponentiation result too large")
        result = base ** exponent
        # A negative base with a fractional exponent yields a complex number.
        if isinstance(result, complex):
            raise ValueError("Complex results are not supported")
        return result

    def _eval_node(self, node):
        """Recursively evaluate AST node.

        Raises:
            ValueError: If the node is not a supported numeric construct.
        """
        if isinstance(node, ast.Constant):
            value = node.value
            # "..." passes the character filter but parses to Ellipsis.
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                raise ValueError(f"Unsupported constant: {value!r}")
            return value
        # Python < 3.8 emits Num nodes. The class is matched by name because
        # referencing ast.Num warns on newer versions and fails once removed.
        if type(node).__name__ == 'Num':
            return node.n
        if isinstance(node, ast.BinOp):
            left = self._eval_node(node.left)
            right = self._eval_node(node.right)
            if isinstance(node.op, ast.Pow):
                return self._checked_pow(left, right)
            handler = self._BINARY_OPERATORS.get(type(node.op))
            if handler is not None:
                return handler(left, right)
        elif isinstance(node, ast.UnaryOp):
            handler = self._UNARY_OPERATORS.get(type(node.op))
            if handler is not None:
                return handler(self._eval_node(node.operand))
        raise ValueError(f"Unsupported operation: {type(node)}")
class FileProcessorTool:
    """Tool for processing files from GAIA tasks.

    Files are downloaded from ``<api_base_url>/files/<task_id>`` and
    summarised into a dict whose shape depends on the file type. Every
    processor reports failures as ``{'error': <message>}`` instead of raising.
    """

    def __init__(self, api_base_url: str, timeout: float = 30):
        """Store the API root and the per-request timeout in seconds."""
        self.api_base_url = api_base_url
        self.timeout = timeout

    @staticmethod
    def _to_finite_float(value) -> Optional[float]:
        """Convert ``value`` to a finite float, or return None if impossible.

        "nan"/"inf" parse as floats but are rejected here, because they are
        not usable as operands in later arithmetic.
        """
        try:
            number = float(value)
        except (TypeError, ValueError, OverflowError):
            return None
        if number != number or abs(number) == float('inf'):
            return None
        return number

    def process_file(self, task_id: str, file_name: str) -> Dict:
        """Process file associated with a task.

        Args:
            task_id: Identifier used to download the file.
            file_name: Name whose extension (case-insensitive) selects the
                processor: .csv, .txt or .json; anything else is generic.

        Returns:
            The processor's result dict, or ``{'error': ...}`` if the
            download or processing raises.
        """
        try:
            # Download file
            file_content = self._download_file(task_id)
            # Process based on file extension
            lowered_name = (file_name or '').lower()
            if lowered_name.endswith('.csv'):
                return self._process_csv(file_content)
            if lowered_name.endswith('.txt'):
                return self._process_text(file_content)
            if lowered_name.endswith('.json'):
                return self._process_json(file_content)
            return self._process_generic(file_content)
        except Exception as e:
            print(f"File processing error: {e}")
            return {'error': str(e)}

    def _download_file(self, task_id: str) -> bytes:
        """Download file from API.

        Raises:
            requests.RequestException: On connection failure, timeout or a
                non-success HTTP status.
        """
        url = f"{self.api_base_url.rstrip('/')}/files/{task_id}"
        # A timeout keeps one unresponsive server from stalling the whole run.
        response = requests.get(url, timeout=self.timeout)
        response.raise_for_status()
        return response.content

    def _process_csv(self, content: bytes) -> Dict:
        """Process CSV file.

        The first row is treated as the header. Every data cell that parses
        as a finite number is collected into 'numbers'.
        """
        try:
            import io
            import csv
            # 'utf-8-sig' drops a leading BOM, which would otherwise end up
            # inside the first header name.
            text_content = content.decode('utf-8-sig')
            # Parse CSV
            rows = list(csv.reader(io.StringIO(text_content)))
            if not rows:
                return {'error': 'Empty CSV file'}
            headers, data_rows = rows[0], rows[1:]
            # Extract numbers for potential calculations
            numbers = []
            for row in data_rows:
                for cell in row:
                    number = self._to_finite_float(cell)
                    if number is not None:
                        numbers.append(number)
            return {
                'type': 'csv',
                'headers': headers,
                'rows': data_rows,
                'row_count': len(data_rows),
                'numbers': numbers,
                'summary': f'CSV with {len(headers)} columns and {len(data_rows)} rows'
            }
        except Exception as e:
            return {'error': f'CSV processing failed: {e}'}

    def _process_text(self, content: bytes) -> Dict:
        """Process text file.

        Returns the decoded text with line and word counts and the unsigned
        numbers that appear in it.
        """
        try:
            text = content.decode('utf-8-sig')
            # Extract numbers from text
            numbers = [float(match) for match in re.findall(r'\d+\.?\d*', text)]
            # Basic text analysis
            line_count = len(text.split('\n'))
            word_count = len(text.split())
            return {
                'type': 'text',
                'content': text,
                'line_count': line_count,
                'word_count': word_count,
                'numbers': numbers,
                'summary': f'Text file with {line_count} lines and {word_count} words'
            }
        except Exception as e:
            return {'error': f'Text processing failed: {e}'}

    def _process_json(self, content: bytes) -> Dict:
        """Process JSON file.

        Returns the parsed data together with every number found anywhere
        in the structure.
        """
        try:
            # json.loads rejects text that still carries a BOM.
            data = json.loads(content.decode('utf-8-sig'))
            # Extract numbers from JSON structure
            numbers = self._extract_numbers_from_json(data)
            item_count = len(data) if isinstance(data, (list, dict)) else 1
            return {
                'type': 'json',
                'data': data,
                'numbers': numbers,
                'summary': f'JSON file with {item_count} items'
            }
        except Exception as e:
            return {'error': f'JSON processing failed: {e}'}

    def _process_generic(self, content: bytes) -> Dict:
        """Process generic file.

        Content that decodes as UTF-8 is handled as text; anything else is
        reported as binary with its size.
        """
        try:
            # Try to decode as text first
            try:
                content.decode('utf-8')
            except UnicodeDecodeError:
                # Binary file
                return {
                    'type': 'binary',
                    'size': len(content),
                    'summary': f'Binary file of {len(content)} bytes'
                }
            return self._process_text(content)
        except Exception as e:
            return {'error': f'Generic processing failed: {e}'}

    def _extract_numbers_from_json(self, data, numbers=None):
        """Recursively extract numbers from JSON structure.

        Args:
            data: Parsed JSON value.
            numbers: Accumulator list; a new list is created when omitted.

        Returns:
            The accumulator, holding each finite number as a float in
            traversal order.
        """
        if numbers is None:
            numbers = []
        if isinstance(data, bool):
            # bool is a subclass of int; true/false are flags, not quantities.
            return numbers
        if isinstance(data, (int, float)):
            number = self._to_finite_float(data)
            if number is not None:
                numbers.append(number)
        elif isinstance(data, dict):
            for value in data.values():
                self._extract_numbers_from_json(value, numbers)
        elif isinstance(data, list):
            for item in data:
                self._extract_numbers_from_json(item, numbers)
        return numbers
class TextAnalyzerTool:
    """Tool for analyzing and extracting information from text"""

    _QUESTION_TERMS = ('who', 'what', 'when', 'where')
    _CALCULATION_TERMS = ('calculate', 'sum', 'total', 'average')
    _STOP_WORDS = frozenset({
        'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'can', 'had',
        'her', 'was', 'one', 'our', 'out', 'day', 'get', 'has', 'him', 'his',
        'how', 'man', 'new', 'now', 'old', 'see', 'two', 'way', 'who', 'boy',
        'did', 'its', 'let', 'put', 'say', 'she', 'too', 'use',
    })

    @staticmethod
    def _mentions(text: str, terms) -> bool:
        """Return True if lower-cased ``text`` contains any term as a whole word.

        Simple inflections (s, es, d, ed, ing) are accepted; words that only
        contain a term, such as "whole" for "who", are not.
        """
        alternatives = '|'.join(re.escape(term) for term in terms)
        pattern = r'\b(?:' + alternatives + r')(?:s|es|d|ed|ing)?\b'
        return re.search(pattern, text) is not None

    def analyze(self, text: str, context: Optional[Dict] = None) -> str:
        """Analyze text and extract relevant information.

        Args:
            text: The text (typically the question) to analyse.
            context: Optional results of earlier steps; the keys read are
                'search_data' and 'calculation'.

        Returns:
            A string taken from the context when the text looks like a
            question or a calculation request, otherwise a keyword summary.
            Any internal error is reported as "Analysis failed: ...".
        """
        try:
            lowered = text.lower()
            # Look for specific patterns based on question type
            if self._mentions(lowered, self._QUESTION_TERMS):
                return self._analyze_question_pattern(text, context)
            # Look for calculations
            if self._mentions(lowered, self._CALCULATION_TERMS):
                return self._analyze_calculation_pattern(text, context)
            # Default analysis
            keywords = self._extract_keywords(text)
            return f"Analysis of text with keywords: {', '.join(keywords[:5])}"
        except Exception as e:
            return f"Analysis failed: {e}"

    def _extract_keywords(self, text: str) -> List[str]:
        """Extract important keywords from text.

        Returns up to ten lower-cased words of three or more letters that
        are not stop words, most frequent first.
        """
        from collections import Counter
        words = re.findall(r'\b[A-Za-z]{3,}\b', text.lower())
        frequencies = Counter(word for word in words if word not in self._STOP_WORDS)
        return [word for word, _ in frequencies.most_common(10)]

    def _analyze_question_pattern(self, text: str, context: Dict) -> str:
        """Analyze question patterns to extract answers.

        Returns the 'summary' of the first search result in the context, or
        a fixed message when there is none.
        """
        # This is where you'd implement more sophisticated NLP
        search_results = (context or {}).get('search_data')
        if isinstance(search_results, list) and search_results:
            first = search_results[0]
            if isinstance(first, dict):
                return str(first.get('summary', 'No summary available'))
        return "Unable to extract specific answer from question pattern"

    def _analyze_calculation_pattern(self, text: str, context: Dict) -> str:
        """Analyze calculation patterns.

        Prefers an existing calculation result from the context; otherwise
        lists the numbers that appear in the text.
        """
        calculation = (context or {}).get('calculation')
        # A failed calculation is stored as None and must not be rendered
        # as the string "None".
        if calculation is not None:
            return str(calculation)
        # Extract numbers for potential calculation
        numbers = re.findall(r'\d+\.?\d*', text)
        if numbers:
            return f"Found numbers: {', '.join(numbers)}"
        return "No calculation pattern found"
+# Main execution functions
def test_agent_on_random_question(api_base_url: str):
    """Test the agent on a random question.

    Fetches one question from ``<api_base_url>/random-question``, lets the
    agent answer it and prints a short report.

    Returns:
        A dict with 'task_id', 'question', 'agent_answer' and
        'processing_time' (seconds), or None if the request or the run fails.
    """
    agent = GAIAAgent(api_base_url)
    try:
        # Get random question
        response = requests.get(f"{api_base_url}/random-question", timeout=30)
        # Without this an HTTP error payload would be handed to the agent as
        # if it were a question.
        response.raise_for_status()
        question = response.json()
        banner = "=" * 50
        print(banner)
        print("TESTING RANDOM QUESTION")
        print(banner)
        print(f"Task ID: {question.get('task_id')}")
        print(f"Question: {question.get('question')}")
        print(f"File: {question.get('file_name', 'None')}")
        print("-" * 50)
        # Solve question
        start_time = time.time()
        answer = agent.solve_question(question)
        elapsed = time.time() - start_time
        print(f"Agent Answer: {answer}")
        print(f"Processing Time: {elapsed:.2f} seconds")
        print(banner)
        return {
            'task_id': question.get('task_id'),
            'question': question.get('question'),
            'agent_answer': answer,
            'processing_time': elapsed
        }
    except Exception as e:
        print(f"Error testing random question: {e}")
        return None
def run_full_evaluation(api_base_url: str, username: str, agent_code_url: str):
    """Run the complete evaluation on every question the API returns.

    Fetches ``<api_base_url>/questions``, answers each one, prints progress
    and submits the collected answers via ``submit_results``.

    Returns:
        A dict with 'answers', 'successful_answers', 'total_questions' and
        'submission_result', or None if the question list is empty or
        cannot be fetched, or if an unexpected error occurs.
    """
    agent = GAIAAgent(api_base_url)
    try:
        # Get all questions
        response = requests.get(f"{api_base_url}/questions", timeout=30)
        response.raise_for_status()
        questions = response.json()
        if not questions:
            # Nothing to answer; also avoids dividing by zero in the summary.
            print("No questions returned; nothing to evaluate.")
            return None
        total = len(questions)
        print(f"Starting evaluation on {total} questions...")
        answers = []
        successful_answers = 0
        for position, question in enumerate(questions, start=1):
            # .get()/``or ''`` keep one malformed record from aborting the run.
            task_id = question.get('task_id')
            question_text = question.get('question') or ''
            print(f"\n{'='*60}")
            print(f"PROCESSING QUESTION {position}/{total}")
            print(f"{'='*60}")
            print(f"Task ID: {task_id}")
            print(f"Question: {question_text[:100]}...")
            try:
                start_time = time.time()
                answer = agent.solve_question(question)
                elapsed = time.time() - start_time
                answers.append({
                    'task_id': task_id,
                    'submitted_answer': answer
                })
                print(f"Answer: {answer}")
                print(f"Time: {elapsed:.2f}s")
                if answer and answer != "Unable to determine answer":
                    successful_answers += 1
            except Exception as e:
                print(f"Error processing question {position}: {e}")
                answers.append({
                    'task_id': task_id,
                    'submitted_answer': "Processing error"
                })
        print(f"\n{'='*60}")
        print(f"EVALUATION COMPLETE")
        print(f"{'='*60}")
        print(f"Successfully processed: {successful_answers}/{total} questions")
        print(f"Success rate: {(successful_answers/total*100):.1f}%")
        # Submit results
        print(f"\nSubmitting results...")
        submission_result = submit_results(api_base_url, username, agent_code_url, answers)
        return {
            'answers': answers,
            'successful_answers': successful_answers,
            'total_questions': total,
            'submission_result': submission_result
        }
    except Exception as e:
        print(f"Error in full evaluation: {e}")
        return None
def submit_results(api_base_url: str, username: str, agent_code_url: str, answers: List[Dict]):
    """Submit results to the leaderboard.

    Posts the username, agent code URL and answers as JSON to
    ``<api_base_url>/submit``.

    Returns:
        The decoded JSON response on HTTP 200, otherwise None (non-200
        status, network error, timeout or undecodable body).
    """
    submission_data = {
        'username': username,
        'agent_code': agent_code_url,
        'answers': answers
    }
    try:
        # Bounded wait so a stalled server cannot hang the evaluation.
        response = requests.post(
            f"{api_base_url}/submit", json=submission_data, timeout=60
        )
        if response.status_code != 200:
            print(f"❌ Submission failed: {response.status_code}")
            print(f"Response: {response.text}")
            return None
        result = response.json()
        print(f"✅ Submission successful!")
        print(f"Score: {result.get('score', 'N/A')}%")
        print(f"Rank: {result.get('rank', 'N/A')}")
        return result
    except Exception as e:
        print(f"Error submitting results: {e}")
        return None
# Example usage and testing functions
if __name__ == "__main__":
    # Configuration - Replace with actual values
    API_BASE_URL = "https://your-api-endpoint.com"  # Replace with actual API URL
    USERNAME = "your-huggingface-username"  # Replace with your username
    AGENT_CODE_URL = "https://huggingface.co/spaces/your-username/gaia-agent/tree/main"  # Replace with your space URL
    print("GAIA Agent Implementation")
    print("=" * 40)
    # Test on a few random questions first
    print("1. Testing on random questions...")
    for test_number in range(1, 4):
        print(f"\n--- Random Test {test_number} ---")
        test_result = test_agent_on_random_question(API_BASE_URL)
        if test_result:
            print(f"✅ Test {test_number} completed")
        else:
            print(f"❌ Test {test_number} failed")
    # Ask user if they want to run full evaluation
    user_input = input("\nRun full evaluation on all 20 questions? (y/n): ")
    if user_input.lower() == 'y':
        print("\n" + "=" * 60)
        print("STARTING FULL EVALUATION")
        print("=" * 60)
        evaluation_result = run_full_evaluation(API_BASE_URL, USERNAME, AGENT_CODE_URL)
        if evaluation_result:
            # 'submission_result' is present but None when the upload failed,
            # so a ``.get(key, {})`` default would not protect the lookups
            # below from an AttributeError.
            submission = evaluation_result.get('submission_result') or {}
            score = submission.get('score')
            print(f"\n🎉 Evaluation completed!")
            print(f"Final score: {score if score is not None else 'N/A'}%")
            if isinstance(score, (int, float)) and score >= 30:
                print(f"🏆 CONGRATULATIONS! You've achieved the 30% threshold!")
                print(f"🎓 You've earned your Certificate of Completion!")
            else:
                print(f"📈 Keep improving! You need 30% to earn the certificate.")
        else:
            print(f"❌ Evaluation failed. Please check your implementation.")
    else:
        print("Evaluation cancelled. Use the test functions to debug your agent first.")
+# Additional utility functions for development and debugging
def debug_question_analysis(api_base_url: str, task_id: Optional[str] = None):
    """Debug question analysis for a specific question.

    Args:
        api_base_url: Root URL of the questions API.
        task_id: When given, the question with this id is looked up in
            ``/questions``; otherwise ``/random-question`` is used.

    Returns:
        A ``(question, analysis, plan)`` tuple, or None when no question
        was found.

    Raises:
        requests.RequestException: If the HTTP request fails, times out or
            returns an error status.
    """
    agent = GAIAAgent(api_base_url)
    if task_id:
        # Get specific question (you'd need to implement this endpoint or find the question in the list)
        response = requests.get(f"{api_base_url}/questions", timeout=30)
        response.raise_for_status()
        question = next(
            (item for item in response.json() if item.get('task_id') == task_id),
            None,
        )
    else:
        # Get random question
        response = requests.get(f"{api_base_url}/random-question", timeout=30)
        response.raise_for_status()
        question = response.json()
    if not question:
        print("Question not found")
        return None
    print("QUESTION ANALYSIS DEBUG")
    print("=" * 40)
    print(f"Task ID: {question.get('task_id')}")
    print(f"Question: {question.get('question')}")
    print(f"File: {question.get('file_name', 'None')}")
    print("-" * 40)
    # Analyze question
    analysis = agent._analyze_question(question)
    print("Analysis Results:")
    for key, value in analysis.items():
        print(f"  {key}: {value}")
    # Create plan
    plan = agent._create_execution_plan(analysis, question)
    print(f"\nExecution Plan:")
    for step_number, step in enumerate(plan, start=1):
        print(f"  {step_number}. {step['action']} (priority: {step['priority']})")
    return question, analysis, plan
def benchmark_agent_performance(api_base_url: str, num_tests: int = 10):
    """Benchmark agent performance on multiple random questions.

    Args:
        api_base_url: Root URL of the questions API.
        num_tests: Number of random questions to fetch and answer.

    Returns:
        A list with one dict per attempt. Successful attempts carry
        'question_id', 'task_id', 'answer', 'processing_time' and 'success';
        attempts that raised carry 'question_id', 'error' and 'success'.
    """
    agent = GAIAAgent(api_base_url)
    results = []
    total_time = 0
    successful_answers = 0
    print(f"BENCHMARKING AGENT ({num_tests} questions)")
    print("=" * 50)
    for question_id in range(1, num_tests + 1):
        try:
            response = requests.get(f"{api_base_url}/random-question", timeout=30)
            response.raise_for_status()
            question = response.json()
            start_time = time.time()
            answer = agent.solve_question(question)
            processing_time = time.time() - start_time
            total_time += processing_time
            # One definition of success feeds both the counter and the
            # per-question record, so the two can never disagree.
            success = bool(answer) and answer != "Unable to determine answer"
            if success:
                successful_answers += 1
            status = "✅" if success else "❌"
            print(f"{status} Question {question_id}: {processing_time:.2f}s - {answer[:50]}...")
            results.append({
                'question_id': question_id,
                'task_id': question.get('task_id'),
                'answer': answer,
                'processing_time': processing_time,
                'success': success
            })
        except Exception as e:
            print(f"❌ Question {question_id}: Error - {e}")
            results.append({
                'question_id': question_id,
                'error': str(e),
                'success': False
            })
    # Guard the averages so num_tests <= 0 prints zeros instead of raising.
    success_rate = successful_answers / num_tests * 100 if num_tests > 0 else 0.0
    average_time = total_time / num_tests if num_tests > 0 else 0.0
    # Print summary
    print("\n" + "=" * 50)
    print("BENCHMARK RESULTS")
    print("=" * 50)
    print(f"Successful answers: {successful_answers}/{num_tests} ({success_rate:.1f}%)")
    print(f"Average processing time: {average_time:.2f}s")
    print(f"Total time: {total_time:.2f}s")
    return results