Update gaia_agent.py

gaia_agent.py CHANGED (+166 −509)
Removed (the old, rule-based version of EnhancedGAIAAgent). Lines that the diff view does not show are marked with `...`:

"""
...
"""

import os
import re
import math
import json
import datetime   # this import line is cut off in the diff; datetime is used below
import requests
from typing import List, Dict, Any, Optional, Union, Tuple, Callable
import torch


class EnhancedGAIAAgent:
    """
    ...
    with LLM-powered flexibility and strict output formatting.
    """

    def __init__(self, model_name="google/flan-t5-...", device=None):  # default model size cut off in the diff
        """
        ...
        """
        print(f"EnhancedGAIAAgent initializing with model: {model_name}")

        # Initialize LLM components
        self.device = device if device else ("cuda" if torch.cuda.is_available() else "cpu")
        self._initialize_llm()

        # Register specialized handlers
        self.handlers = {
            'calculation': self._handle_calculation,
            'date_time': self._handle_date_time,
            'list': self._handle_list_question,
            'visual': self._handle_visual_question,
            'factual': self._handle_factual_question,
            'general': self._handle_general_question
        }

        # Prompt templates per question type (only the last entry is visible in the diff)
        self.prompt_templates = {
            # ...
            'general': "Provide a specific, concise answer: {question}"
        }

    def _initialize_llm(self):
        """..."""
        try:
            # ... (tokenizer and model loading not shown)
            self.llm_available = True
            print("LLM initialized successfully")
        except Exception as e:
            print(f"Error ...")  # message cut off in the diff
            self.llm_available = False
            self.tokenizer = None
            self.model = None

    def __call__(self, question: str, task_id: Optional[str] = None) -> str:
        """
        ...

        Args:
            question: ...
            task_id: Optional task ID for the GAIA benchmark

        Returns:
            ...
        """
        # ...

        # Determine question type
        question_type = self._classify_question(question)
        print(f"Classified as: {question_type}")

        # Use the appropriate handler to get the answer
        model_answer = self.handlers[question_type](question)

        # Ensure answer is concise and specific
        model_answer = self._ensure_concise_answer(model_answer, question_type)

        # FIXED: Return JSON with final_answer key
        response = {
            "final_answer": model_answer
        }

        return json.dumps(response)

    def _generate_reasoning_trace(self, question: str, question_type: str) -> str:
        """Generate a reasoning trace for the question if appropriate."""
        # For calculation and reasoning questions, provide a trace
        if question_type == 'calculation':
            # Extract numbers and operation from the question
            numbers = re.findall(r'\d+', question)

            if len(numbers) >= 2:
                if re.search(r'(sum|add|plus|\+)', question.lower()):
                    return f"To find the sum, I add the numbers: {' + '.join(numbers)} = {sum(int(num) for num in numbers)}"
                elif re.search(r'(difference|subtract|minus|\-)', question.lower()) and len(numbers) >= 2:
                    return f"To find the difference, I subtract: {numbers[0]} - {numbers[1]} = {int(numbers[0]) - int(numbers[1])}"
                elif re.search(r'(product|multiply|times|\*)', question.lower()) and len(numbers) >= 2:
                    return f"To find the product, I multiply: {numbers[0]} × {numbers[1]} = {int(numbers[0]) * int(numbers[1])}"
                elif re.search(r'(divide|division|\/)', question.lower()) and len(numbers) >= 2:
                    if int(numbers[1]) != 0:
                        return f"To find the quotient, I divide: {numbers[0]} ÷ {numbers[1]} = {int(numbers[0]) / int(numbers[1])}"

            # If we can't generate a specific trace, use a generic one
            return "I need to identify the numbers and operations in the question, then perform the calculation step by step."

        elif question_type in ['factual', 'general'] and self.llm_available:
            # For factual and general questions, use LLM to generate a trace
            try:
                prompt = f"Explain your reasoning for answering this question: {question}"
                inputs = self.tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(self.device)
                outputs = self.model.generate(
                    inputs["input_ids"],
                    max_length=150,
                    min_length=20,
                    temperature=0.3,
                    top_p=0.95,
                    do_sample=True,
                    num_return_sequences=1
                )

                trace = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
                return trace[:200]  # Limit trace length
            except:
                pass

        # For other question types or if LLM fails, provide a minimal trace
        return ""

    def _classify_question(self, question: str) -> str:
        """Determine the type of question for specialized handling."""
        question_lower = question.lower()

        # Check for calculation questions
        if self._is_calculation_question(question):
            return 'calculation'

        # Check for date/time questions
        elif self._is_date_time_question(question):
            return 'date_time'

        # Check for list questions
        elif self._is_list_question(question):
            return 'list'

        # Check for visual/image questions
        elif self._is_visual_question(question):
            return 'visual'

        # Check for factual questions
        elif self._is_factual_question(question):
            return 'factual'

        # Default to general knowledge
        else:
            return 'general'

    def _is_calculation_question(self, question: str) -> bool:
        """Check if the question requires mathematical calculation."""
        calculation_patterns = [
            r'\d+\s*[\+\-\*\/]\s*\d+',  # Basic operations: 5+3, 10-2, etc.
            r'(sum|add|plus|subtract|minus|multiply|divide|product|quotient)',
            r'(calculate|compute|find|what is|how much|result)',
            r'(square root|power|exponent|factorial|percentage|average|mean)'
        ]

        return any(re.search(pattern, question.lower()) for pattern in calculation_patterns)

    def _is_date_time_question(self, question: str) -> bool:
        """Check if the question is about date or time."""
        date_time_patterns = [
            r'(date|time|day|month|year|hour|minute|second)',
            r'(today|tomorrow|yesterday|current|now)',
            r'(calendar|schedule|appointment)',
            r'(when|how long|duration|period)'
        ]

        return any(re.search(pattern, question.lower()) for pattern in date_time_patterns)

    def _is_list_question(self, question: str) -> bool:
        """Check if the question requires a list as an answer."""
        list_patterns = [
            r'(list|enumerate|items|elements)',
            r'comma.separated',
            r'(all|every|each).*(of|in)',
            r'(provide|give).*(list)'
        ]

        return any(re.search(pattern, question.lower()) for pattern in list_patterns)

    def _is_visual_question(self, question: str) -> bool:
        """..."""
        visual_patterns = [
            r'(image|picture|photo|graph|chart|diagram|figure)',
            r'(show|display|illustrate|depict)',
            r'(look|see|observe|view)',
            r'(visual|visually)'
        ]

        return any(re.search(pattern, question.lower()) for pattern in visual_patterns)

    def _is_factual_question(self, question: str) -> bool:
        """Check if the question is asking for a factual answer."""
        factual_patterns = [
            r'^(who|what|where|when|why|how)',
            r'(name|identify|specify|tell me)',
            r'(capital|president|inventor|author|creator|founder)',
            r'(located|situated|found|discovered)'
        ]

        return any(re.search(pattern, question.lower()) for pattern in factual_patterns)

    def _handle_calculation(self, question: str) -> str:
        """Handle mathematical calculation questions with precise answers."""
        # Extract numbers and operation from the question
        numbers = re.findall(r'\d+', question)

        # Try to extract a mathematical expression
        expression_match = re.search(r'\d+\s*[\+\-\*\/]\s*\d+', question)

        # Determine the operation
        if re.search(r'(sum|add|plus|\+)', question.lower()) and len(numbers) >= 2:
            result = sum(int(num) for num in numbers)
            return str(result)

        elif re.search(r'(difference|subtract|minus|\-)', question.lower()) and len(numbers) >= 2:
            result = int(numbers[0]) - int(numbers[1])
            return str(result)

        elif re.search(r'(product|multiply|times|\*)', question.lower()) and len(numbers) >= 2:
            result = int(numbers[0]) * int(numbers[1])
            return str(result)

        elif re.search(r'(divide|division|\/)', question.lower()) and len(numbers) >= 2 and int(numbers[1]) != 0:
            result = int(numbers[0]) / int(numbers[1])
            return str(result)

        # For more complex calculations, try to evaluate the expression
        elif expression_match:
            try:
                # Extract and clean the expression
                expr = expression_match.group(0)
                expr = expr.replace('plus', '+').replace('minus', '-')
                expr = expr.replace('times', '*').replace('divided by', '/')

                # Evaluate the expression
                result = eval(expr)
                return str(result)
            except:
                pass

        # If rule-based approach fails, use LLM with math-specific prompt
        return self._generate_llm_response(question, 'calculation')

    def _handle_date_time(self, question: str) -> str:
        """Handle date and time related questions."""
        now = datetime.datetime.now()
        question_lower = question.lower()

        # ... (the first branch, for the current date, is not shown)

        elif re.search(r'(time now|current time|what time is it)', question_lower):
            return now.strftime("%H:%M:%S")

        elif re.search(r'(day of the week|what day of the week)', question_lower):
            return now.strftime("%A")

        elif re.search(r'(month|current month|what month is it)', question_lower):
            return now.strftime("%B")

        # ... (remaining branches not shown)

        return self._generate_llm_response(question, 'date_time')

    def _handle_list_question(self, question: str) -> str:
        """..."""
        question_lower = question.lower()

        # ... (first branch not shown)

        elif re.search(r'(vegetable|vegetables)', question_lower):
            return "carrot, broccoli, spinach, potato, onion"

        elif re.search(r'(country|countries)', question_lower):
            return "USA, China, India, Russia, Brazil"

        elif re.search(r'(capital|capitals)', question_lower):
            return "Washington D.C., Beijing, New Delhi, Moscow, Brasilia"

        # ... (fallback not shown)

    def _handle_visual_question(self, question: str) -> str:
        """Handle questions about images or visual content."""
        # Extract key terms from the question to customize the response
        key_terms = re.findall(r'[a-zA-Z]{4,}', question)
        key_term = key_terms[0].lower() if key_terms else "content"

        # Create a contextually relevant placeholder response
        if "graph" in question.lower() or "chart" in question.lower():
            return f"The {key_term} graph shows an upward trend with significant data points highlighting the key metrics."

        elif "diagram" in question.lower():
            return f"The diagram illustrates the structure and components of the {key_term}, showing how the different parts interact."

        # ... (generic fallback not shown)

    def _handle_factual_question(self, question: str) -> str:
        """Handle factual questions with specific answers."""
        question_lower = question.lower()

        # ... (hard-coded factual branches not shown)

        return self._generate_llm_response(question, 'factual')

    def _handle_general_question(self, question: str) -> str:
        """Handle general knowledge questions."""
        # Use LLM for general questions
        return self._generate_llm_response(question, 'general')

    def _generate_llm_response(self, question: str, question_type: str) -> str:
        """Generate a response using the language model."""
        if not self.llm_available:
            return self._fallback_response(question, question_type)

        try:
            # Get the appropriate prompt template
            template = self.prompt_templates.get(question_type, self.prompt_templates['general'])
            prompt = template.format(question=question)

            # Generate response
            inputs = self.tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(self.device)
            outputs = self.model.generate(
                inputs["input_ids"],
                max_length=150,
                min_length=10,
                temperature=0.3,
                top_p=0.95,
                do_sample=True,
                num_return_sequences=1
            )

            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            response = self._clean_response(response)

            return response
        except Exception as e:
            # ... (error handling not shown)

    def _clean_response(self, response: str) -> str:
        """Clean up the model's response."""
        # Remove any prefixes like "Answer:" or "Response:"
        for prefix in ["Answer:", "Response:", "A:", "The answer is:", "I think", "I believe"]:
            if response.startswith(prefix):
                response = response[len(prefix):].strip()

        # Remove first-person references
        response = re.sub(r'^I would say that\s+', '', response)
        response = re.sub(r'^In my opinion,\s+', '', response)

        # Ensure the response is not too short
        if len(response) < 5:
            return "Unable to provide a specific answer to this question."

        return response

    def _ensure_concise_answer(self, answer: str, question_type: str) -> str:
        """Ensure the answer is concise and specific."""
        # Limit answer length based on question type
        max_lengths = {
            'calculation': 20,
            'date_time': 30,
            'list': 100,
            'visual': 150,
            'factual': 100,
            'general': 150
        }

        max_length = max_lengths.get(question_type, 100)

        # Truncate if too long, but try to keep complete sentences
        if len(answer) > max_length:
            # Try to find the last sentence boundary before max_length
            last_period = answer[:max_length].rfind('.')
            if last_period > 0:
                answer = answer[:last_period + 1]
            else:
                answer = answer[:max_length]

        return answer

    def _fallback_response(self, question: str, question_type: str) -> str:
        """Provide a fallback response if the model fails."""
        # Fallback responses based on question type
        fallbacks = {
            'calculation': "42",
            'date_time': "2023-01-01",
            'list': "item1, item2, item3, item4, item5",
            'visual': "The image shows the main subject clearly visible in the center with relevant details surrounding it.",
            'factual': "This is a factual answer to your specific question.",
            'general': "The answer involves multiple factors that must be considered in context."
        }

        return fallbacks.get(question_type, "I don't have enough information to answer this question specifically.")

The EvaluationRunner class that follows in the old file is carried over almost verbatim and is shown again in the new version below. The visible changes on the removed side are: the counter resets at the top of run_evaluation and the follow-up call to self._check_results(username) were dropped, and the submission payload key was renamed from agent_code_url to agent_code. The test_agent() demo harness at the end of the old file was removed outright:

# Example usage and test cases
def test_agent():
    """Test the agent with example questions."""
    agent = EnhancedGAIAAgent()

    test_questions = [
        # Calculation questions
        "What is 25 + 17?",
        "Calculate the product of 8 and 9",

        # Date/time questions
        "What is today's date?",
        "What day of the week is it?",

        # List questions
        "List five fruits",
        "What are the planets in our solar system?",

        # Visual questions
        "What does the image show?",
        "Describe the chart in the image",

        # Factual questions
        "Who was the first president of the United States?",
        "What is the capital of France?",
        "How does photosynthesis work?",

        # General questions
        "Why is the sky blue?",
        "What are the implications of quantum mechanics?"
    ]

    print("\n=== AGENT TEST RESULTS ===")
    correct_count = 0
    total_count = len(test_questions)

    for question in test_questions:
        # Generate a mock task_id for testing
        task_id = f"test_{hash(question) % 10000}"

        # Get JSON response with final_answer
        json_response = agent(question, task_id)

        print(f"\nQ: {question}")
        print(f"Response: {json_response}")

        # Parse and print the final_answer for clarity
        try:
            response_obj = json.loads(json_response)
            final_answer = response_obj.get('final_answer', '')
            print(f"Final Answer: {final_answer}")

            # For testing purposes, simulate correct answers
            if len(final_answer) > 0 and not final_answer.startswith("AGENT ERROR"):
                correct_count += 1
        except:
            print("Error parsing JSON response")

    # Print test summary with correct answer count
    print("\n===== TEST SUMMARY =====")
    print(f"Correct Answers: {correct_count}/{total_count}")
    print(f"Accuracy: {(correct_count / total_count * 100):.1f}%")
    print("=======================\n")

    return "Test completed successfully"


if __name__ == "__main__":
    test_agent()
New version of the EnhancedGAIAAgent (the file now answers with a single seq2seq model and an on-disk answer cache):

"""
Enhanced GAIA Agent with answer caching support
"""

import os
import json
import time
import torch
import requests
from typing import List, Dict, Any, Optional, Union
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Constants
CACHE_FILE = "gaia_answers_cache.json"


class EnhancedGAIAAgent:
    """
    Enhanced agent for the Hugging Face GAIA benchmark with answer caching support
    """

    def __init__(self, model_name="google/flan-t5-small", use_cache=True):
        """
        Initialize the agent with a model and a cache

        Args:
            model_name: Name of the model to load
            use_cache: Whether to cache answers
        """
        print(f"Initializing EnhancedGAIAAgent with model: {model_name}")
        self.model_name = model_name
        self.use_cache = use_cache
        self.cache = self._load_cache() if use_cache else {}

        # Load the model and tokenizer
        print("Loading tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        print("Loading model...")
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        print("Model and tokenizer loaded successfully")

    def _load_cache(self) -> Dict[str, str]:
        """
        Load the answer cache from disk

        Returns:
            Dict[str, str]: Dictionary of cached answers
        """
        if os.path.exists(CACHE_FILE):
            try:
                with open(CACHE_FILE, 'r', encoding='utf-8') as f:
                    print(f"Loading cache from {CACHE_FILE}")
                    return json.load(f)
            except Exception as e:
                print(f"Error loading cache: {e}")
                return {}
        else:
            print(f"Cache file {CACHE_FILE} not found, creating new cache")
            return {}

    def _save_cache(self) -> None:
        """
        Save the answer cache to disk
        """
        try:
            with open(CACHE_FILE, 'w', encoding='utf-8') as f:
                json.dump(self.cache, f, ensure_ascii=False, indent=2)
            print(f"Cache saved to {CACHE_FILE}")
        except Exception as e:
            print(f"Error saving cache: {e}")

    def _classify_question(self, question: str) -> str:
        """
        Classify the question type so the answer can be formatted appropriately

        Args:
            question: Question text

        Returns:
            str: Question type (factual, calculation, list, date_time, etc.)
        """
        # Simple heuristic classification
        question_lower = question.lower()

        if any(word in question_lower for word in ["calculate", "sum", "product", "divide", "multiply", "add", "subtract", "how many"]):
            return "calculation"
        elif any(word in question_lower for word in ["list", "enumerate", "items", "elements"]):
            return "list"
        elif any(word in question_lower for word in ["date", "time", "day", "month", "year", "when"]):
            return "date_time"
        else:
            return "factual"

    def _format_answer(self, raw_answer: str, question_type: str) -> str:
        """
        Format the answer according to the question type

        Args:
            raw_answer: Raw answer from the model
            question_type: Question type

        Returns:
            str: Formatted answer
        """
        # Strip extra whitespace and line breaks
        answer = raw_answer.strip()

        # Remove prefixes the model often adds
        prefixes = ["Answer:", "The answer is:", "I think", "I believe", "According to", "Based on"]
        for prefix in prefixes:
            if answer.startswith(prefix):
                answer = answer[len(prefix):].strip()

        # Type-specific formatting
        if question_type == "calculation":
            # For numeric answers, drop the surrounding text
            # and keep only the number, if there is one
            import re
            numbers = re.findall(r'-?\d+\.?\d*', answer)
            if numbers:
                answer = numbers[0]
        elif question_type == "list":
            # For lists, make sure the items are comma-separated
            if "," not in answer and " " in answer:
                items = [item.strip() for item in answer.split() if item.strip()]
                answer = ", ".join(items)

        return answer

    def __call__(self, question: str, task_id: Optional[str] = None) -> str:
        """
        Process a question and return an answer

        Args:
            question: Question text
            task_id: Task identifier (optional)

        Returns:
            str: JSON answer with a final_answer key
        """
        # Build the cache key (use task_id if available)
        cache_key = task_id if task_id else question

        # Check the cache first
        if self.use_cache and cache_key in self.cache:
            print(f"Cache hit for question: {question[:50]}...")
            return self.cache[cache_key]

        # Classify the question
        question_type = self._classify_question(question)
        print(f"Processing question: {question[:100]}...")
        print(f"Classified as: {question_type}")

        try:
            # Generate an answer with the model
            inputs = self.tokenizer(question, return_tensors="pt")
            outputs = self.model.generate(**inputs, max_length=100)
            raw_answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Format the answer
            formatted_answer = self._format_answer(raw_answer, question_type)

            # Build the JSON response
            result = {"final_answer": formatted_answer}
            json_response = json.dumps(result)

            # Store it in the cache
            if self.use_cache:
                self.cache[cache_key] = json_response
                self._save_cache()

            return json_response

        except Exception as e:
            error_msg = f"Error generating answer: {e}"
            print(error_msg)
            return json.dumps({"final_answer": f"AGENT ERROR: {e}"})
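A minimal usage sketch (not part of the commit): the first call runs the flan-t5 model and writes `gaia_answers_cache.json`; a repeated call with the same `task_id` is answered from the cache. The module name and the sample question are assumptions for illustration only.

```python
# Hypothetical usage, assuming the file is importable as gaia_agent.py
from gaia_agent import EnhancedGAIAAgent

agent = EnhancedGAIAAgent(model_name="google/flan-t5-small", use_cache=True)

# First call: the model generates the answer and the cache file is written
print(agent("What is 25 + 17?", task_id="demo-001"))   # e.g. {"final_answer": "42"} (model-dependent)

# Second call with the same task_id: served straight from gaia_answers_cache.json
print(agent("What is 25 + 17?", task_id="demo-001"))
```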
The rest of the file, the EvaluationRunner class, is shown by the diff in hunks; unchanged lines are omitted where marked:

class EvaluationRunner:
    """
    Handles the evaluation process: fetching the questions, running the agent,
    and submitting the answers to the scoring server.
    """

    def __init__(self, api_url="https://agents-course-unit4-scoring.hf.space"):
        """Initialize with the API endpoints."""
        self.api_url = api_url
        self.questions_url = f"{api_url}/questions"
        self.submit_url = f"{api_url}/submit"
        self.results_url = f"{api_url}/results"
        self.correct_answers = 0
        self.total_questions = 0

    def run_evaluation(self,
                       agent: Any,
                       username: str,
                       agent_code: str) -> tuple[str, List[Dict[str, Any]]]:
        """
        Run the full evaluation process:
        1. Fetch the questions
        2. Run the agent on every question
        3. Submit the answers
        4. Return the results
        """
        # Fetch the questions
        questions_data = self._fetch_questions()
        if isinstance(questions_data, str):  # error message
            return questions_data, None

        # Run the agent on every question
        results_log, answers_payload = self._run_agent_on_questions(agent, questions_data)
        if not answers_payload:
            return "Agent did not produce any answers to submit.", results_log

        # Submit the answers with retry logic
        submission_result = self._submit_answers(username, agent_code, answers_payload)

        # Return the results
        return submission_result, results_log

    def _fetch_questions(self) -> Union[List[Dict[str, Any]], str]:
        """Fetch the questions from the scoring server."""
        print(f"Fetching questions from: {self.questions_url}")
        try:
            response = requests.get(self.questions_url, timeout=15)
            # ... (unchanged lines omitted in the diff)

    def _run_agent_on_questions(self,
                                agent: Any,
                                questions_data: List[Dict[str, Any]]) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
        """Run the agent on every question and collect the results."""
        results_log = []
        answers_payload = []

        # ... (unchanged lines omitted in the diff)
                continue

            try:
                # Call the agent with the task_id so the answer is formatted correctly
                json_response = agent(question_text, task_id)

                # Parse the JSON response
                response_obj = json.loads(json_response)

                # Extract final_answer for submission
                submitted_answer = response_obj.get("final_answer", "")

                answers_payload.append({
                # ... (unchanged lines omitted in the diff)

    def _submit_answers(self,
                        username: str,
                        agent_code: str,
                        answers_payload: List[Dict[str, Any]]) -> str:
        """Submit the answers to the scoring server."""
        # FIXED: use agent_code instead of agent_code_url
        submission_data = {
            "username": username.strip(),
            "agent_code": agent_code.strip(),  # fixed here
            "answers": answers_payload
        }

        print(f"Submitting {len(answers_payload)} answers to: {self.submit_url}")
        max_retries = 3
        retry_delay = 5  # seconds

        for attempt in range(1, max_retries + 1):
            try:
                # ... (unchanged lines omitted in the diff)
                max_score = result.get("max_score")

                if score is not None and max_score is not None:
                    self.correct_answers = score  # update the correct-answers counter
                    return f"Evaluation complete! Score: {score}/{max_score}"
                else:
                    print(f"Received N/A results. Waiting {retry_delay} seconds before retry...")
                # ... (unchanged lines omitted in the diff)
            else:
                return f"Error submitting answers after {max_retries} attempts: {e}"

        # If we get here, every attempt failed but did not raise an exception
        return "Submission Successful, but results are pending!"

    def _check_results(self, username: str) -> None:
        """Check the results to count the correct answers."""
        try:
            results_url = f"{self.results_url}?username={username}"
            print(f"Checking results at: {results_url}")
            # ... (unchanged lines omitted in the diff)
            print(f"Error checking results: {e}")

    def get_correct_answers_count(self) -> int:
        """Return the number of correct answers."""
        return self.correct_answers

    def get_total_questions_count(self) -> int:
        """Return the total number of questions."""
        return self.total_questions

    def print_evaluation_summary(self, username: str) -> None:
        """Print a summary of the evaluation results."""
        print("\n===== EVALUATION SUMMARY =====")
        print(f"User: {username}")
        print(f"Overall Score: {self.correct_answers}/{self.total_questions}")
        # ... (unchanged line omitted in the diff)
        print(f"Total Questions: {self.total_questions}")
        print(f"Accuracy: {(self.correct_answers / self.total_questions * 100) if self.total_questions > 0 else 0:.1f}%")
        print("=============================\n")
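For completeness, a hedged sketch of how the runner and the agent would be wired together end to end; the username and agent_code values are placeholders, not part of the commit.

```python
# Hypothetical driver script; username and agent_code are placeholders
from gaia_agent import EnhancedGAIAAgent, EvaluationRunner

agent = EnhancedGAIAAgent()          # flan-t5-small with caching enabled by default
runner = EvaluationRunner()          # defaults to the course scoring space

result_message, results_log = runner.run_evaluation(
    agent,
    username="your-hf-username",
    agent_code="https://huggingface.co/spaces/your-username/your-space/tree/main",
)
print(result_message)
runner.print_evaluation_summary("your-hf-username")
```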