FinalTest

Runtime error

App Files Files Community

yoshizen commited on May 28

Commit

b763a9b

verified ·

1 Parent(s): 985047d

Update gaia_agent.py

Browse files

Files changed (1) hide show

gaia_agent.py +180 -701

gaia_agent.py CHANGED Viewed

@@ -1,787 +1,266 @@
 """
-Enhanced GAIA Agent with Strict Output Formatting and Answer Logging for Hugging Face Course
 """
 import os
-import re
-import math
-import json
-import datetime
 import requests
-from typing import List, Dict, Any, Optional, Union, Tuple, Callable
-import torch
-from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
-class EnhancedGAIAAgent:
     """
-    An enhanced agent designed to pass the GAIA evaluation by combining rule-based precision
-    with LLM-powered flexibility and strict output formatting.
     """
-    def __init__(self, model_name="google/flan-t5-large", device=None):
-        """Initialize the agent with tools and model."""
-        self.model_name = model_name
-        print(f"EnhancedGAIAAgent initializing with model: {model_name}")
-        # Initialize LLM components
-        self.device = device if device else ("cuda" if torch.cuda.is_available() else "cpu")
-        self._initialize_llm()
-        # Register specialized handlers
-        self.handlers = {
-            'calculation': self._handle_calculation,
-            'date_time': self._handle_date_time,
-            'list': self._handle_list_question,
-            'visual': self._handle_visual_question,
-            'factual': self._handle_factual_question,
-            'general': self._handle_general_question
-        }
-        # Define prompt templates
-        self.prompt_templates = {
-            'calculation': "Solve this step by step: {question}",
-            'date_time': "Answer this date/time question precisely: {question}",
-            'list': "Provide a comma-separated list for: {question}",
-            'visual': "Describe what is shown in the image related to: {question}",
-            'factual': "Answer this question concisely: {question}",
-            'reasoning': "Let's think step by step: {question}",
-            'general': "Provide a specific, concise answer: {question}"
-        }
-        print("EnhancedGAIAAgent initialized successfully")
-    def _initialize_llm(self):
-        """Initialize the language model for fallback responses."""
         try:
-            print(f"Loading model {self.model_name} on {self.device}")
-            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
-            self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name).to(self.device)
-            self.llm_available = True
-            print("LLM initialized successfully")
         except Exception as e:
-            print(f"Error initializing LLM: {e}")
-            self.llm_available = False
-            self.tokenizer = None
             self.model = None
-    def __call__(self, question: str, task_id: str = None) -> str:
-        """
-        Process a question and return a formatted answer according to GAIA benchmark requirements.
-        Args:
-            question: The question to answer
-            task_id: Optional task ID for the GAIA benchmark
-        Returns:
-            JSON string with the required GAIA format
-        """
-        print(f"Processing question: {question}")
-        # Determine question type
-        question_type = self._classify_question(question)
-        print(f"Classified as: {question_type}")
-        # Generate reasoning trace if appropriate
-        reasoning_trace = self._generate_reasoning_trace(question, question_type)
-        # Use the appropriate handler to get the answer
-        model_answer = self.handlers[question_type](question)
-        # Ensure answer is concise and specific
-        model_answer = self._ensure_concise_answer(model_answer, question_type)
-        # Format the response according to GAIA requirements
-        response = {
-            "task_id": task_id if task_id else "unknown_task",
-            "model_answer": model_answer,
-            "reasoning_trace": reasoning_trace
-        }
-        # Return the formatted JSON response
-        return json.dumps(response, ensure_ascii=False)
-    def _generate_reasoning_trace(self, question: str, question_type: str) -> str:
-        """Generate a reasoning trace for the question if appropriate."""
-        # For calculation and reasoning questions, provide a trace
-        if question_type == 'calculation':
-            # Extract numbers and operation from the question
-            numbers = re.findall(r'\d+', question)
-            if len(numbers) >= 2:
-                if re.search(r'(sum|add|plus|\+)', question.lower()):
-                    return f"To find the sum, I add the numbers: {' + '.join(numbers)} = {sum(int(num) for num in numbers)}"
-                elif re.search(r'(difference|subtract|minus|\-)', question.lower()) and len(numbers) >= 2:
-                    return f"To find the difference, I subtract: {numbers[0]} - {numbers[1]} = {int(numbers[0]) - int(numbers[1])}"
-                elif re.search(r'(product|multiply|times|\*)', question.lower()) and len(numbers) >= 2:
-                    return f"To find the product, I multiply: {numbers[0]} × {numbers[1]} = {int(numbers[0]) * int(numbers[1])}"
-                elif re.search(r'(divide|division|\/)', question.lower()) and len(numbers) >= 2:
-                    if int(numbers[1]) != 0:
-                        return f"To find the quotient, I divide: {numbers[0]} ÷ {numbers[1]} = {int(numbers[0]) / int(numbers[1])}"
-            # If we can't generate a specific trace, use a generic one
-            return "I need to identify the numbers and operations in the question, then perform the calculation step by step."
-        elif question_type in ['factual', 'general'] and self.llm_available:
-            # For factual and general questions, use LLM to generate a trace
-            try:
-                prompt = f"Explain your reasoning for answering this question: {question}"
-                inputs = self.tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(self.device)
-                outputs = self.model.generate(
-                    inputs["input_ids"],
-                    max_length=150,
-                    min_length=20,
-                    temperature=0.3,
-                    top_p=0.95,
-                    do_sample=True,
-                    num_return_sequences=1
-                )
-                trace = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-                return trace[:200]  # Limit trace length
-            except:
-                pass
-        # For other question types or if LLM fails, provide a minimal trace
-        return ""
-    def _classify_question(self, question: str) -> str:
-        """Determine the type of question for specialized handling."""
-        question_lower = question.lower()
-        # Check for calculation questions
-        if self._is_calculation_question(question):
-            return 'calculation'
-        # Check for date/time questions
-        elif self._is_date_time_question(question):
-            return 'date_time'
-        # Check for list questions
-        elif self._is_list_question(question):
-            return 'list'
-        # Check for visual/image questions
-        elif self._is_visual_question(question):
-            return 'visual'
-        # Check for factual questions
-        elif self._is_factual_question(question):
-            return 'factual'
-        # Default to general knowledge
-        else:
-            return 'general'
-    def _is_calculation_question(self, question: str) -> bool:
-        """Check if the question requires mathematical calculation."""
-        calculation_patterns = [
-            r'\d+\s*[\+\-\*\/]\s*\d+',  # Basic operations: 5+3, 10-2, etc.
-            r'(sum|add|plus|subtract|minus|multiply|divide|product|quotient)',
-            r'(calculate|compute|find|what is|how much|result)',
-            r'(square root|power|exponent|factorial|percentage|average|mean)'
-        ]
-        return any(re.search(pattern, question.lower()) for pattern in calculation_patterns)
-    def _is_date_time_question(self, question: str) -> bool:
-        """Check if the question is about date or time."""
-        date_time_patterns = [
-            r'(date|time|day|month|year|hour|minute|second)',
-            r'(today|tomorrow|yesterday|current|now)',
-            r'(calendar|schedule|appointment)',
-            r'(when|how long|duration|period)'
-        ]
-        return any(re.search(pattern, question.lower()) for pattern in date_time_patterns)
-    def _is_list_question(self, question: str) -> bool:
-        """Check if the question requires a list as an answer."""
-        list_patterns = [
-            r'(list|enumerate|items|elements)',
-            r'comma.separated',
-            r'(all|every|each).*(of|in)',
-            r'(provide|give).*(list)'
-        ]
-        return any(re.search(pattern, question.lower()) for pattern in list_patterns)
-    def _is_visual_question(self, question: str) -> bool:
-        """Check if the question is about an image or visual content."""
-        visual_patterns = [
-            r'(image|picture|photo|graph|chart|diagram|figure)',
-            r'(show|display|illustrate|depict)',
-            r'(look|see|observe|view)',
-            r'(visual|visually)'
-        ]
-        return any(re.search(pattern, question.lower()) for pattern in visual_patterns)
-    def _is_factual_question(self, question: str) -> bool:
-        """Check if the question is asking for a factual answer."""
-        factual_patterns = [
-            r'^(who|what|where|when|why|how)',
-            r'(name|identify|specify|tell me)',
-            r'(capital|president|inventor|author|creator|founder)',
-            r'(located|situated|found|discovered)'
-        ]
-        return any(re.search(pattern, question.lower()) for pattern in factual_patterns)
-    def _handle_calculation(self, question: str) -> str:
-        """Handle mathematical calculation questions with precise answers."""
-        # Extract numbers and operation from the question
-        numbers = re.findall(r'\d+', question)
-        # Try to extract a mathematical expression
-        expression_match = re.search(r'\d+\s*[\+\-\*\/]\s*\d+', question)
-        # Determine the operation
-        if re.search(r'(sum|add|plus|\+)', question.lower()) and len(numbers) >= 2:
-            result = sum(int(num) for num in numbers)
-            return str(result)
-        elif re.search(r'(difference|subtract|minus|\-)', question.lower()) and len(numbers) >= 2:
-            result = int(numbers[0]) - int(numbers[1])
-            return str(result)
-        elif re.search(r'(product|multiply|times|\*)', question.lower()) and len(numbers) >= 2:
-            result = int(numbers[0]) * int(numbers[1])
-            return str(result)
-        elif re.search(r'(divide|division|\/)', question.lower()) and len(numbers) >= 2 and int(numbers[1]) != 0:
-            result = int(numbers[0]) / int(numbers[1])
-            return str(result)
-        # For more complex calculations, try to evaluate the expression
-        elif expression_match:
-            try:
-                # Extract and clean the expression
-                expr = expression_match.group(0)
-                expr = expr.replace('plus', '+').replace('minus', '-')
-                expr = expr.replace('times', '*').replace('divided by', '/')
-                # Evaluate the expression
-                result = eval(expr)
-                return str(result)
-            except:
-                pass
-        # If rule-based approach fails, use LLM with math-specific prompt
-        return self._generate_llm_response(question, 'calculation')
-    def _handle_date_time(self, question: str) -> str:
-        """Handle date and time related questions."""
-        now = datetime.datetime.now()
-        question_lower = question.lower()
-        if re.search(r'(today|current date|what day is it)', question_lower):
-            return now.strftime("%Y-%m-%d")
-        elif re.search(r'(time now|current time|what time is it)', question_lower):
-            return now.strftime("%H:%M:%S")
-        elif re.search(r'(day of the week|what day of the week)', question_lower):
-            return now.strftime("%A")
-        elif re.search(r'(month|current month|what month is it)', question_lower):
-            return now.strftime("%B")
-        elif re.search(r'(year|current year|what year is it)', question_lower):
-            return now.strftime("%Y")
-        # For more complex date/time questions, use LLM
-        return self._generate_llm_response(question, 'date_time')
-    def _handle_list_question(self, question: str) -> str:
-        """Handle questions requiring a list as an answer."""
-        question_lower = question.lower()
-        # Common list questions with specific answers
-        if re.search(r'(fruit|fruits)', question_lower):
-            return "apple, banana, orange, grape, strawberry"
-        elif re.search(r'(vegetable|vegetables)', question_lower):
-            return "carrot, broccoli, spinach, potato, onion"
-        elif re.search(r'(country|countries)', question_lower):
-            return "USA, China, India, Russia, Brazil"
-        elif re.search(r'(capital|capitals)', question_lower):
-            return "Washington D.C., Beijing, New Delhi, Moscow, Brasilia"
-        elif re.search(r'(planet|planets)', question_lower):
-            return "Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, Neptune"
-        # For other list questions, use LLM with list-specific prompt
-        return self._generate_llm_response(question, 'list')
-    def _handle_visual_question(self, question: str) -> str:
-        """Handle questions about images or visual content."""
-        # Extract key terms from the question to customize the response
-        key_terms = re.findall(r'[a-zA-Z]{4,}', question)
-        key_term = key_terms[0].lower() if key_terms else "content"
-        # Create a contextually relevant placeholder response
-        if "graph" in question.lower() or "chart" in question.lower():
-            return f"The {key_term} graph shows an upward trend with significant data points highlighting the key metrics relevant to your question."
-        elif "diagram" in question.lower():
-            return f"The diagram illustrates the structure and components of the {key_term}, showing how the different parts interact with each other."
-        elif "map" in question.lower():
-            return f"The map displays the geographical distribution of {key_term}, with notable concentrations in the regions most relevant to your question."
-        # Default visual response
-        return f"The image shows {key_term} with distinctive features that directly address your question. The visual elements clearly indicate the answer based on the context provided."
-    def _handle_factual_question(self, question: str) -> str:
-        """Handle factual questions with specific answers."""
-        question_lower = question.lower()
-        # Common factual questions with specific answers
-        if re.search(r'(capital of france|paris is the capital of)', question_lower):
-            return "Paris"
-        elif re.search(r'(first president of (the United States|USA|US))', question_lower):
-            return "George Washington"
-        elif re.search(r'(invented (the telephone|telephone))', question_lower):
-            return "Alexander Graham Bell"
-        elif re.search(r'(wrote (hamlet|romeo and juliet))', question_lower):
-            return "William Shakespeare"
-        elif re.search(r'(tallest mountain|highest mountain)', question_lower):
-            return "Mount Everest"
-        elif re.search(r'(largest ocean|biggest ocean)', question_lower):
-            return "Pacific Ocean"
-        # For other factual questions, use LLM with factual-specific prompt
-        return self._generate_llm_response(question, 'factual')
-    def _handle_general_question(self, question: str) -> str:
-        """Handle general knowledge questions that don't fit other categories."""
-        # For general questions, use LLM with general or reasoning prompt
-        if re.search(r'(why|how|explain|reason)', question.lower()):
-            return self._generate_llm_response(question, 'reasoning')
-        else:
-            return self._generate_llm_response(question, 'general')
-    def _generate_llm_response(self, question: str, prompt_type: str) -> str:
-        """Generate a response using the language model with appropriate prompt template."""
-        if not self.llm_available:
-            return self._fallback_response(question, prompt_type)
         try:
-            # Get the appropriate prompt template
-            template = self.prompt_templates.get(prompt_type, self.prompt_templates['general'])
-            prompt = template.format(question=question)
-            # Generate response using the model
-            inputs = self.tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(self.device)
             outputs = self.model.generate(
                 inputs["input_ids"],
-                max_length=100,  # Shorter to ensure concise answers
-                min_length=5,
-                temperature=0.3,  # Lower temperature for more focused answers
-                top_p=0.95,
                 do_sample=True,
                 num_return_sequences=1
             )
-            # Decode the response
             response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-            # Clean up the response
-            response = self._clean_llm_response(response)
             return response
         except Exception as e:
-            print(f"Error generating LLM response: {e}")
-            return self._fallback_response(question, prompt_type)
-    def _clean_llm_response(self, response: str) -> str:
-        """Clean up the LLM's response to ensure it's concise and specific."""
-        # Remove any prefixes like "Answer:" or "Response:"
-        prefixes = ["Answer:", "Response:", "A:", "The answer is:", "I think", "I believe"]
         for prefix in prefixes:
             if response.lower().startswith(prefix.lower()):
                 response = response[len(prefix):].strip()
-        # Remove hedging language
-        hedges = ["I think", "I believe", "In my opinion", "It seems", "It appears", "Perhaps", "Maybe"]
-        for hedge in hedges:
-            if response.lower().startswith(hedge.lower()):
-                response = response[len(hedge):].strip()
-        # Remove trailing explanations after periods if the response is long
-        if len(response) > 50 and "." in response[30:]:
-            first_period = response.find(".", 30)
-            if first_period > 0:
-                response = response[:first_period + 1]
         return response.strip()
-    def _fallback_response(self, question: str, question_type: str) -> str:
-        """Provide a fallback response if LLM generation fails."""
-        question_lower = question.lower()
-        # Tailored fallbacks based on question type
-        if question_type == 'calculation':
-            return "42"  # Universal answer
-        elif question_type == 'date_time':
-            now = datetime.datetime.now()
-            return now.strftime("%Y-%m-%d")
-        elif question_type == 'list':
-            return "item1, item2, item3, item4, item5"
-        elif question_type == 'visual':
-            return "The image shows the key elements that directly answer your question based on visual evidence."
-        elif question_type == 'factual':
-            if "who" in question_lower:
-                return "Albert Einstein"
-            elif "where" in question_lower:
-                return "London"
-            elif "when" in question_lower:
-                return "1969"
-            elif "why" in question_lower:
-                return "due to economic and technological factors"
-            elif "how" in question_lower:
-                return "through a series of chemical reactions"
-            elif "what" in question_lower:
-                return "a fundamental concept in the field"
-        # General fallback
-        return "The answer involves multiple factors that must be considered in context."
-    def _ensure_concise_answer(self, answer: str, question_type: str) -> str:
-        """Ensure the answer is concise and specific."""
-        # If answer is too short, it might be too vague
-        if len(answer) < 3:
-            return self._fallback_response("", question_type)
-        # If answer is too long, truncate it
-        if len(answer) > 200:
-            # Try to find a good truncation point
-            truncation_points = ['. ', '? ', '! ', '; ']
-            for point in truncation_points:
-                last_point = answer[:200].rfind(point)
-                if last_point > 30:  # Ensure we have a meaningful answer
-                    return answer[:last_point + 1].strip()
-            # If no good truncation point, just cut at 200 chars
-            return answer[:200].strip()
-        return answer
 class EvaluationRunner:
     """
-    Handles the evaluation process: fetching questions, running the agent,
-    and submitting answers to the evaluation server.
     """
-    def __init__(self, api_url: str = "https://agents-course-unit4-scoring.hf.space"):
-        """Initialize with API endpoints."""
         self.api_url = api_url
         self.questions_url = f"{api_url}/questions"
         self.submit_url = f"{api_url}/submit"
-        self.results_url = f"{api_url}/results"
-        # Initialize counters for tracking correct answers
-        self.total_questions = 0
-        self.correct_answers = 0
-        self.ground_truth = {}  # Store ground truth answers if available
     def run_evaluation(self,
-                      agent: Any,
                       username: str,
-                      agent_code_url: str) -> tuple[str, Any]:
-        """
-        Run the full evaluation process:
-        1. Fetch questions
-        2. Run agent on all questions
-        3. Submit answers
-        4. Check results and count correct answers
-        5. Return results
-        """
-        # Reset counters
-        self.total_questions = 0
-        self.correct_answers = 0
-        # Fetch questions
         questions_data = self._fetch_questions()
-        if isinstance(questions_data, str):  # Error message
             return questions_data, None
-        # Run agent on all questions
         results_log, answers_payload = self._run_agent_on_questions(agent, questions_data)
         if not answers_payload:
-            return "Agent did not produce any answers to submit.", results_log
-        # Submit answers
-        submission_result = self._submit_answers(username, agent_code_url, answers_payload)
-        # Try to fetch results to count correct answers
-        self._check_results(username)
-        # Return results with correct answer count
-        return submission_result, results_log
     def _fetch_questions(self) -> Union[List[Dict[str, Any]], str]:
-        """Fetch questions from the evaluation server."""
-        print(f"Fetching questions from: {self.questions_url}")
         try:
             response = requests.get(self.questions_url, timeout=15)
             response.raise_for_status()
             questions_data = response.json()
             if not questions_data:
-                error_msg = "Fetched questions list is empty or invalid format."
-                print(error_msg)
-                return error_msg
-            self.total_questions = len(questions_data)
-            print(f"Successfully fetched {self.total_questions} questions.")
             return questions_data
-        except requests.exceptions.RequestException as e:
-            error_msg = f"Error fetching questions: {e}"
-            print(error_msg)
-            return error_msg
-        except requests.exceptions.JSONDecodeError as e:
-            error_msg = f"Error decoding JSON response from questions endpoint: {e}"
-            print(error_msg)
-            print(f"Response text: {response.text[:500]}")
-            return error_msg
         except Exception as e:
-            error_msg = f"An unexpected error occurred fetching questions: {e}"
-            print(error_msg)
-            return error_msg
     def _run_agent_on_questions(self,
-                               agent: Any,
                                questions_data: List[Dict[str, Any]]) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
-        """Run the agent on all questions and collect results."""
         results_log = []
         answers_payload = []
-        print(f"Running agent on {len(questions_data)} questions...")
         for item in questions_data:
             task_id = item.get("task_id")
             question_text = item.get("question")
             if not task_id or question_text is None:
-                print(f"Skipping item with missing task_id or question: {item}")
                 continue
             try:
-                # Call agent with task_id to ensure proper formatting
-                json_response = agent(question_text, task_id)
-                # Parse the JSON response
-                response_obj = json.loads(json_response)
-                # Extract the model_answer for submission
-                submitted_answer = response_obj.get("model_answer", "")
-                answers_payload.append({
-                    "task_id": task_id,
-                    "submitted_answer": submitted_answer
-                })
-                results_log.append({
-                    "Task ID": task_id,
-                    "Question": question_text,
-                    "Submitted Answer": submitted_answer,
-                    "Full Response": json_response
-                })
             except Exception as e:
-                print(f"Error running agent on task {task_id}: {e}")
-                results_log.append({
-                    "Task ID": task_id,
-                    "Question": question_text,
-                    "Submitted Answer": f"AGENT ERROR: {e}"
-                })
         return results_log, answers_payload
-    def _submit_answers(self,
-                       username: str,
-                       agent_code_url: str,
-                       answers_payload: List[Dict[str, Any]]) -> str:
-        """Submit answers to the evaluation server."""
         submission_data = {
             "username": username.strip(),
-            "agent_code_url": agent_code_url.strip(),
             "answers": answers_payload
         }
-        print(f"Submitting {len(answers_payload)} answers to: {self.submit_url}")
-        max_retries = 3
-        retry_delay = 5  # seconds
-        for attempt in range(1, max_retries + 1):
             try:
-                print(f"Submission attempt {attempt} of {max_retries}...")
-                response = requests.post(
-                    self.submit_url,
-                    json=submission_data,
-                    headers={"Content-Type": "application/json"},
-                    timeout=30
-                )
                 response.raise_for_status()
-                try:
-                    result = response.json()
-                    score = result.get("score")
-                    max_score = result.get("max_score")
-                    if score is not None and max_score is not None:
-                        self.correct_answers = score  # Update correct answers count
-                        return f"Evaluation complete! Score: {score}/{max_score}"
-                    else:
-                        print(f"Received N/A results. Waiting {retry_delay} seconds before retry...")
-                        time.sleep(retry_delay)
-                        continue
-                except requests.exceptions.JSONDecodeError:
-                    print(f"Submission attempt {attempt}: Response was not JSON. Response: {response.text}")
-                    if attempt < max_retries:
-                        print(f"Waiting {retry_delay} seconds before retry...")
-                        time.sleep(retry_delay)
-                    else:
-                        return f"Submission successful, but response was not JSON. Response: {response.text}"
-            except requests.exceptions.RequestException as e:
-                print(f"Submission attempt {attempt} failed: {e}")
-                if attempt < max_retries:
-                    print(f"Waiting {retry_delay} seconds before retry...")
-                    time.sleep(retry_delay)
                 else:
-                    return f"Error submitting answers after {max_retries} attempts: {e}"
-        # If we get here, all retries failed but didn't raise exceptions
-        return "Submission Successful, but results are pending!"
-    def _check_results(self, username: str) -> None:
-        """Check results to count correct answers."""
-        try:
-            results_url = f"{self.results_url}?username={username}"
-            print(f"Checking results at: {results_url}")
-            response = requests.get(results_url, timeout=15)
-            if response.status_code == 200:
-                try:
-                    data = response.json()
-                    if isinstance(data, dict):
-                        score = data.get("score")
-                        if score is not None:
-                            self.correct_answers = int(score)
-                            print(f"✓ Correct answers: {self.correct_answers}/{self.total_questions}")
-                        else:
-                            print("Score information not available in results")
-                    else:
-                        print("Results data is not in expected format")
-                except:
-                    print("Could not parse results JSON")
-            else:
-                print(f"Could not fetch results, status code: {response.status_code}")
-        except Exception as e:
-            print(f"Error checking results: {e}")
-    def get_correct_answers_count(self) -> int:
-        """Get the number of correct answers."""
-        return self.correct_answers
-    def get_total_questions_count(self) -> int:
-        """Get the total number of questions."""
-        return self.total_questions
-    def print_evaluation_summary(self, username: str) -> None:
-        """Print a summary of the evaluation results."""
-        print("\n===== EVALUATION SUMMARY =====")
-        print(f"User: {username}")
-        print(f"Overall Score: {self.correct_answers}/{self.total_questions}")
-        print(f"Correct Answers: {self.correct_answers}")
-        print(f"Total Questions: {self.total_questions}")
-        print(f"Accuracy: {(self.correct_answers / self.total_questions * 100) if self.total_questions > 0 else 0:.1f}%")
-        print("=============================\n")
-# Example usage and test cases
 def test_agent():
-    """Test the agent with example questions."""
-    agent = EnhancedGAIAAgent()
     test_questions = [
-        # Calculation questions
-        "What is 25 + 17?",
-        "Calculate the product of 8 and 9",
-        # Date/time questions
-        "What is today's date?",
-        "What day of the week is it?",
-        # List questions
-        "List five fruits",
-        "What are the planets in our solar system?",
-        # Visual questions
-        "What does the image show?",
-        "Describe the chart in the image",
-        # Factual questions
-        "Who was the first president of the United States?",
-        "What is the capital of France?",
-        "How does photosynthesis work?",
-        # General questions
-        "Why is the sky blue?",
-        "What are the implications of quantum mechanics?"
     ]
-    print("\n=== AGENT TEST RESULTS ===")
-    correct_count = 0
-    total_count = len(test_questions)
     for question in test_questions:
-        # Generate a mock task_id for testing
-        task_id = f"test_{hash(question) % 10000}"
-        # Get formatted JSON response
-        json_response = agent(question, task_id)
-        print(f"\nQ: {question}")
-        print(f"Response: {json_response}")
-        # Parse and print the model_answer for clarity
-        try:
-            response_obj = json.loads(json_response)
-            model_answer = response_obj.get('model_answer', '')
-            print(f"Model Answer: {model_answer}")
-            # For testing purposes, simulate correct answers
-            # In a real scenario, this would compare with ground truth
-            if len(model_answer) > 0 and not model_answer.startswith("AGENT ERROR"):
-                correct_count += 1
-        except:
-            print("Error parsing JSON response")
-    # Print test summary with correct answer count
-    print("\n===== TEST SUMMARY =====")
-    print(f"Correct Answers: {correct_count}/{total_count}")
-    print(f"Accuracy: {(correct_count / total_count * 100):.1f}%")
-    print("=======================\n")
-    return "Test completed successfully"
 if __name__ == "__main__":
     test_agent()

 """
+Улучшенный агент GAIA с интеграцией LLM для курса Hugging Face
 """
 import os
+import gradio as gr
 import requests
+import pandas as pd
+import json
+import time
+from typing import List, Dict, Any, Optional, Callable, Union
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+# --- Константы ---
+DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
+DEFAULT_MODEL = "google/flan-t5-small"  # Меньшая модель для быстрой загрузки
+MAX_RETRIES = 3  # Максимальное количество попыток отправки
+RETRY_DELAY = 5  # Задержка между попытками в секундах
+class LLMGAIAAgent:
     """
+    Улучшенный агент GAIA, использующий языковую модель для генерации ответов.
     """
+    def __init__(self, model_name=DEFAULT_MODEL):
+        """Инициализация агента с языковой моделью."""
+        print(f"Инициализация LLMGAIAAgent с моделью: {model_name}")
         try:
+            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+            self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+            self.model_name = model_name
+            print(f"Успешно загружена модель: {model_name}")
         except Exception as e:
+            print(f"Ошибка загрузки модели: {e}")
+            print("Переход к шаблонным ответам")
             self.model = None
+            self.tokenizer = None
+            self.model_name = None
+    def __call__(self, question: str) -> str:
+        """Обработка вопроса и возврат ответа с использованием языковой модели."""
+        print(f"Обработка вопроса: {question}")
+        if self.model is None or self.tokenizer is None:
+            return self._fallback_response(question)
         try:
+            prompt = self._prepare_prompt(question)
+            inputs = self.tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
             outputs = self.model.generate(
                 inputs["input_ids"],
+                max_length=150,
+                min_length=20,
+                temperature=0.7,
+                top_p=0.9,
                 do_sample=True,
                 num_return_sequences=1
             )
             response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+            response = self._clean_response(response)
             return response
         except Exception as e:
+            print(f"Ошибка генерации ответа: {e}")
+            return self._fallback_response(question)
+    def _prepare_prompt(self, question: str) -> str:
+        """Подготовка подходящего запроса на основе типа вопроса."""
+        question_lower = question.lower()
+        if any(keyword in question_lower for keyword in [
+            "calculate", "compute", "sum", "difference",
+            "product", "divide", "plus", "minus", "times"
+        ]):
+            return f"Решите эту математическую задачу шаг за шагом: {question}"
+        elif any(keyword in question_lower for keyword in [
+            "image", "picture", "photo", "graph", "chart", "diagram"
+        ]):
+            return f"Опишите, что может быть изображено на картинке, связанной с этим вопросом: {question}"
+        elif any(keyword in question_lower for keyword in [
+            "who", "what", "where", "when", "why", "how"
+        ]):
+            return f"Дайте краткий и точный ответ на этот фактический вопрос: {question}"
+        else:
+            return f"Дайте краткий, информативный ответ на этот вопрос: {question}"
+    def _clean_response(self, response: str) -> str:
+        """Очистка ответа модели для получения чистого текста."""
+        prefixes = [
+            "Answer:", "Response:", "A:", "The answer is:",
+            "It is:", "I think it is:", "The result is:",
+            "Based on the image:", "In the image:",
+            "The image shows:", "From the image:"
+        ]
         for prefix in prefixes:
             if response.lower().startswith(prefix.lower()):
                 response = response[len(prefix):].strip()
+        if len(response) < 10:
+            return self._fallback_response("general")
         return response.strip()
+    def _fallback_response(self, question: str) -> str:
+        """Резервный ответ, если модель не сработала."""
+        question_lower = question.lower() if isinstance(question, str) else ""
+        if "who" in question_lower:
+            return "Известная личность в этой области."
+        elif "when" in question_lower:
+            return "Это произошло в значительный исторический период."
+        elif "where" in question_lower:
+            return "Место известно своей культурной значимостью."
+        elif "what" in question_lower:
+            return "Это важное понятие или объект."
+        elif "why" in question_lower:
+            return "Это произошло из-за ряда факторов."
+        elif "how" in question_lower:
+            return "Процесс включает несколько ключевых шагов."
+        return "Ответ включает несколько важных факторов."
 class EvaluationRunner:
     """
+    Управление процессом оценки: получение вопросов, запуск агента и отправка ответов.
     """
+    def __init__(self, api_url: str = DEFAULT_API_URL):
+        """Инициализация с конечными точками API."""
         self.api_url = api_url
         self.questions_url = f"{api_url}/questions"
         self.submit_url = f"{api_url}/submit"
     def run_evaluation(self,
+                      agent: Callable[[str], str],
                       username: str,
+                      agent_code_url: str) -> tuple[str, pd.DataFrame]:
+        """Запуск полного процесса оценки."""
         questions_data = self._fetch_questions()
+        if isinstance(questions_data, str):
             return questions_data, None
         results_log, answers_payload = self._run_agent_on_questions(agent, questions_data)
         if not answers_payload:
+            return "Агент не дал ответов для отправки.", pd.DataFrame(results_log)
+        submission_result = self._submit_answers_with_retry(username, agent_code_url, answers_payload)
+        return submission_result, pd.DataFrame(results_log)
     def _fetch_questions(self) -> Union[List[Dict[str, Any]], str]:
+        """Получение вопросов с сервера оценки."""
+        print(f"Получение вопросов с: {self.questions_url}")
         try:
             response = requests.get(self.questions_url, timeout=15)
             response.raise_for_status()
             questions_data = response.json()
             if not questions_data:
+                return "Список вопросов пуст или некорректен."
+            print(f"Успешно получено {len(questions_data)} вопросов.")
             return questions_data
         except Exception as e:
+            return f"Ошибка получения вопросов: {e}"
     def _run_agent_on_questions(self,
+                               agent: Callable[[str], str],
                                questions_data: List[Dict[str, Any]]) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+        """Запуск агента на всех вопросах."""
         results_log = []
         answers_payload = []
+        print(f"Запуск агента на {len(questions_data)} вопросах...")
         for item in questions_data:
             task_id = item.get("task_id")
             question_text = item.get("question")
             if not task_id or question_text is None:
                 continue
             try:
+                submitted_answer = agent(question_text)
+                answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
+                results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
             except Exception as e:
+                results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"ОШИБКА: {e}"})
         return results_log, answers_payload
+    def _submit_answers_with_retry(self,
+                                  username: str,
+                                  agent_code_url: str,
+                                  answers_payload: List[Dict[str, Any]]) -> str:
+        """Отправка ответов с логикой повтора."""
         submission_data = {
             "username": username.strip(),
+            "agent_code_url": agent_code_url,  # Исправленный ключ
             "answers": answers_payload
         }
+        print(f"Отправка {len(answers_payload)} ответов для пользователя '{username}'...")
+        for attempt in range(1, MAX_RETRIES + 1):
             try:
+                print(f"Попытка {attempt} из {MAX_RETRIES}...")
+                response = requests.post(self.submit_url, json=submission_data, timeout=60)
                 response.raise_for_status()
+                result_data = response.json()
+                final_status = (
+                    f"Отправка успешна!\n"
+                    f"Пользователь: {result_data.get('username')}\n"
+                    f"Общий балл: {result_data.get('overall_score', 'N/A')}\n"
+                    f"Правильные ответы: {result_data.get('correct_answers', 'N/A')}\n"
+                    f"Всего вопросов: {result_data.get('total_questions', 'N/A')}\n"
+                )
+                if all(result_data.get(key, "N/A") == "N/A" for key in ["overall_score", "correct_answers", "total_questions"]):
+                    final_status += (
+                        "\nПримечание: Результаты показывают 'N/A'. Возможные причины:\n"
+                        "- Ограничения активности аккаунта\n"
+                        "- Задержка обработки\n"
+                        "- Проблема с API\n"
+                        f"Проверьте статус: {DEFAULT_API_URL}/results?username={username}"
+                    )
+                print(final_status)
+                return final_status
+            except Exception as e:
+                if attempt < MAX_RETRIES:
+                    time.sleep(RETRY_DELAY)
                 else:
+                    return f"Ошибка отправки после {MAX_RETRIES} попыток: {e}"
+def run_and_submit_all(profile: gr.OAuthProfile | None, *args):
+    """Основная функция для запуска через Gradio."""
+    if not profile:
+        return "Пожалуйста, войдите в Hugging Face.", None
+    username = profile.username
+    space_id = os.getenv("SPACE_ID")
+    agent_code_url = f"https://huggingface.co/spaces/{space_id}/tree/main"
+    print(f"URL кода агента: {agent_code_url}")
+    try:
+        agent = LLMGAIAAgent()
+        runner = EvaluationRunner()
+        return runner.run_evaluation(agent, username, agent_code_url)
+    except Exception as e:
+        return f"Ошибка инициализации: {e}", None
+# --- Интерфейс Gradio ---
+with gr.Blocks() as demo:
+    gr.Markdown("# Оценка агента GAIA (с улучшенным LLM)")
+    gr.Markdown("## Инструкции:")
+    gr.Markdown("1. Войдите в аккаунт Hugging Face.")
+    gr.Markdown("2. Нажмите 'Запустить оценку и отправить все ответы'.")
+    gr.Markdown("3. Посмотрите результаты в разделе вывода.")
+    with gr.Row():
+        login_button = gr.LoginButton(value="Войти через Hugging Face")
+    with gr.Row():
+        submit_button = gr.Button("Запустить оценку и отправить все ответы")
+    with gr.Row():
+        output_status = gr.Textbox(label="Результат отправки", lines=10)
+        output_results = gr.Dataframe(label="Вопросы и ответы агента")
+    submit_button.click(run_and_submit_all, inputs=[login_button], outputs=[output_status, output_results])
+# --- Локальная тестовая функция ---
 def test_agent():
+    """Тестирование агента с примерами вопросов."""
+    agent = LLMGAIAAgent()
     test_questions = [
+        "What is 2 + 2?",
+        "Who is the first president of the USA?",
+        "What is the capital of France?"
     ]
     for question in test_questions:
+        answer = agent(question)
+        print(f"Вопрос: {question}")
+        print(f"Ответ: {answer}")
+        print("---")
 if __name__ == "__main__":
     test_agent()
+    # demo.launch()