File size: 153,128 Bytes

e8a0a6a

# recursive_swe_bench/task_generators/bug_fixing.py

from typing import Any, Dict, List, Optional, Tuple, Set, Union
import uuid
import json
import re
import random
import ast
import copy
from pathlib import Path
import tempfile
import subprocess
import shutil
import os

from recursive_swe_bench.core.recursive_task import (
    RecursiveTask, ProblemState, EvaluationResult, Feedback, TaskStatus
)

class BugCategory:
    """Categories of bugs for classification and evolution."""
    SYNTAX = "syntax"
    LOGICAL = "logical"
    PERFORMANCE = "performance"
    SECURITY = "security"
    CONCURRENCY = "concurrency"
    EXCEPTION_HANDLING = "exception_handling"
    API_USAGE = "api_usage"
    MEMORY_MANAGEMENT = "memory_management"
    TYPE_ERROR = "type_error"
    EDGE_CASE = "edge_case"
    DATA_HANDLING = "data_handling"
    DEPENDENCY = "dependency"


class BugFixingTask(RecursiveTask):
    """
    A recursive task for evaluating how models fix bugs in code.
    
    The task presents a piece of code with one or more bugs, and evolves
    based on the model's fix attempts. As the model addresses issues,
    the task may introduce more subtle bugs, change requirements, or
    increase complexity to test adaptive problem-solving.
    """
    
    def __init__(
        self,
        initial_state: ProblemState,
        config: Dict[str, Any] = None,
        test_runner: Any = None
    ):
        """
        Initialize the bug fixing task.
        
        Args:
            initial_state: The initial problem state
            config: Configuration options
            test_runner: Custom test runner (optional)
        """
        super().__init__(initial_state, config)
        self.test_runner = test_runner or DefaultTestRunner()
        self.bug_categories: Set[str] = set(
            self.config.get("bug_categories", [BugCategory.LOGICAL, BugCategory.SYNTAX])
        )
        self.difficulty_progression = self.config.get(
            "difficulty_progression", [0.0, 0.15, 0.3, 0.5, 0.7]
        )
        self.evolution_strategies = self.config.get(
            "evolution_strategies", ["add_subtle_bug", "change_requirements", "increase_complexity"]
        )
        
    def _run_evaluation(self, solution: str) -> EvaluationResult:
        """
        Run tests to evaluate the solution.
        
        Args:
            solution: The solution code
            
        Returns:
            Evaluation results
        """
        # Create a temporary directory to run tests
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)
            
            # Write solution code to file
            solution_file = temp_path / "solution.py"
            with open(solution_file, "w") as f:
                f.write(solution)
            
            # Create test files
            test_files = self._create_test_files(temp_path)
            
            # Run tests
            results = self.test_runner.run_tests(
                solution_file=solution_file,
                test_files=test_files,
                code_context=self.state.code_context
            )
            
            # Calculate score based on test results
            score = self._calculate_score(results)
            
            return EvaluationResult(
                success=results["all_passed"],
                score=score,
                execution_results=results["execution"],
                error_details=results.get("errors"),
                test_results=results["tests"],
                metrics={
                    "passed_tests": results["passed_tests"],
                    "total_tests": results["total_tests"],
                    "execution_time": results["execution_time"],
                    "memory_usage": results.get("memory_usage", 0),
                    "code_complexity": self._calculate_complexity(solution)
                }
            )
    
    def _generate_feedback(self, solution: str, result: EvaluationResult) -> Feedback:
        """
        Generate structured feedback based on evaluation results.
        
        Args:
            solution: The solution code
            result: The evaluation results
            
        Returns:
            Structured feedback
        """
        issues = []
        suggestions = []
        focus_areas = []
        
        # Add issues for failing tests
        if result.test_results:
            for test_name, test_result in result.test_results.items():
                if not test_result["passed"]:
                    issues.append({
                        "type": "test_failure",
                        "test": test_name,
                        "message": test_result.get("message", "Test failed"),
                        "expected": test_result.get("expected"),
                        "actual": test_result.get("actual")
                    })
        
        # Add issues for errors
        if result.error_details:
            for error_type, error_info in result.error_details.items():
                issues.append({
                    "type": "error",
                    "error_type": error_type,
                    "message": error_info.get("message", "An error occurred"),
                    "location": error_info.get("location")
                })
        
        # Generate suggestions based on issues
        for issue in issues:
            if issue["type"] == "test_failure":
                suggestion = self._generate_suggestion_for_test_failure(
                    issue, solution, result.test_results
                )
                if suggestion:
                    suggestions.append(suggestion)
            elif issue["type"] == "error":
                suggestion = self._generate_suggestion_for_error(
                    issue, solution
                )
                if suggestion:
                    suggestions.append(suggestion)
        
        # Determine focus areas based on issues and task state
        focus_areas = self._determine_focus_areas(issues, solution, result)
        
        # Generate adaptation hints based on the current state and results
        adaptation_hints = self._generate_adaptation_hints(solution, result)
        
        # Create summary
        if result.success:
            summary = (
                f"Your solution passes all tests with a score of {result.score:.2f}. "
                f"The code successfully addresses the bugs in the original implementation."
            )
        else:
            passed = result.metrics.get("passed_tests", 0)
            total = result.metrics.get("total_tests", 0)
            summary = (
                f"Your solution passes {passed}/{total} tests with a score of {result.score:.2f}. "
                f"There are still issues that need to be addressed."
            )
        
        return Feedback(
            summary=summary,
            issues=issues,
            suggestions=suggestions,
            focus_areas=focus_areas,
            adaptation_hints=adaptation_hints
        )
    
    def _evolve_state(self, solution: str, result: EvaluationResult, feedback: Feedback) -> ProblemState:
        """
        Evolve the problem state based on the solution and feedback.
        
        This method implements the recursive nature of the benchmark by
        adapting the problem to challenge the model's understanding.
        
        Args:
            solution: The attempted solution
            result: The evaluation results
            feedback: The feedback provided
            
        Returns:
            The evolved problem state
        """
        # If the solution perfectly solved the problem, make it more challenging
        if result.success and result.score > 0.95:
            return self._increase_difficulty(solution, result, feedback)
        
        # If the solution was close but not perfect, focus on the remaining issues
        elif result.score > 0.7:
            return self._focus_remaining_issues(solution, result, feedback)
            
        # If the solution was not very good, provide more guidance
        else:
            return self._provide_more_guidance(solution, result, feedback)
    
    def _increase_difficulty(self, solution: str, result: EvaluationResult, feedback: Feedback) -> ProblemState:
        """
        Increase the difficulty of the problem for models that solved it well.
        
        Args:
            solution: The successful solution
            result: The evaluation results
            feedback: The feedback provided
            
        Returns:
            The evolved problem state with increased difficulty
        """
        # Create a new state based on the current state
        new_state = copy.deepcopy(self.state)
        
        # Increment evolution stage
        new_state.evolution_stage += 1
        
        # Increase difficulty based on progression schedule
        current_difficulty_idx = min(new_state.evolution_stage, 
                                    len(self.difficulty_progression) - 1)
        new_state.difficulty = self.difficulty_progression[current_difficulty_idx]
        
        # Select an evolution strategy based on the current state
        strategy = self._select_evolution_strategy(solution, result, feedback)
        
        # Apply the selected strategy
        if strategy == "add_subtle_bug":
            self._add_subtle_bug(new_state, solution)
        elif strategy == "change_requirements":
            self._change_requirements(new_state, solution)
        elif strategy == "increase_complexity":
            self._increase_complexity(new_state, solution)
        
        # Update the description to reflect the changes
        new_state.description = self._generate_description(new_state)
        
        # Update adaptation vector to guide future evolution
        new_state.adaptation_vector = self._calculate_adaptation_vector(
            solution, result, feedback
        )
        
        return new_state
    
    def _focus_remaining_issues(self, solution: str, result: EvaluationResult, feedback: Feedback) -> ProblemState:
        """
        Evolve the state to focus on remaining issues when the solution is close but not perfect.
        
        Args:
            solution: The nearly-successful solution
            result: The evaluation results
            feedback: The feedback provided
            
        Returns:
            The evolved problem state focusing on remaining issues
        """
        # Create a new state based on the current state
        new_state = copy.deepcopy(self.state)
        
        # Increment evolution stage
        new_state.evolution_stage += 1
        
        # Maintain the same difficulty level
        current_difficulty_idx = min(new_state.evolution_stage - 1, 
                                    len(self.difficulty_progression) - 1)
        new_state.difficulty = self.difficulty_progression[current_difficulty_idx]
        
        # Update the code context to focus on remaining issues
        new_state.code_context["focus_areas"] = feedback.focus_areas
        
        # Highlight failing tests in the code context
        if result.test_results:
            failing_tests = [
                test_name for test_name, test_result in result.test_results.items()
                if not test_result["passed"]
            ]
            new_state.code_context["failing_tests"] = failing_tests
        
        # Update the description to be more specific about remaining issues
        new_state.description = self._generate_focused_description(
            new_state, feedback.issues
        )
        
        # Update adaptation vector to guide future evolution
        new_state.adaptation_vector = self._calculate_adaptation_vector(
            solution, result, feedback
        )
        
        return new_state
    
    def _provide_more_guidance(self, solution: str, result: EvaluationResult, feedback: Feedback) -> ProblemState:
        """
        Evolve the state to provide more guidance when the solution was not very good.
        
        Args:
            solution: The unsuccessful solution
            result: The evaluation results
            feedback: The feedback provided
            
        Returns:
            The evolved problem state with more guidance
        """
        # Create a new state based on the current state
        new_state = copy.deepcopy(self.state)
        
        # Increment evolution stage
        new_state.evolution_stage += 1
        
        # Maintain or slightly decrease difficulty
        current_difficulty_idx = max(0, min(new_state.evolution_stage - 1, 
                                          len(self.difficulty_progression) - 1) - 1)
        new_state.difficulty = self.difficulty_progression[current_difficulty_idx]
        
        # Add more hints to the code context
        new_state.code_context["hints"] = self._generate_hints(
            solution, result, feedback
        )
        
        # Add more detailed information about failing tests
        if result.test_results:
            detailed_test_results = {}
            for test_name, test_result in result.test_results.items():
                if not test_result["passed"]:
                    detailed_test_results[test_name] = {
                        "message": test_result.get("message", "Test failed"),
                        "expected": test_result.get("expected"),
                        "actual": test_result.get("actual"),
                        "hint": self._generate_test_hint(test_name, test_result)
                    }
            new_state.code_context["detailed_test_results"] = detailed_test_results
        
        # Update the description to include more guidance
        new_state.description = self._generate_guided_description(
            new_state, feedback.issues, feedback.suggestions
        )
        
        # Update adaptation vector to guide future evolution
        new_state.adaptation_vector = self._calculate_adaptation_vector(
            solution, result, feedback
        )
        
        return new_state
    
    def _select_evolution_strategy(self, solution: str, result: EvaluationResult, feedback: Feedback) -> str:
        """
        Select an evolution strategy based on the current state and solution.
        
        Args:
            solution: The current solution
            result: The evaluation results
            feedback: The feedback provided
            
        Returns:
            The selected evolution strategy
        """
        available_strategies = self.evolution_strategies.copy()
        
        # Weight the strategies based on the current state
        weights = {}
        
        # Prefer adding subtle bugs if the solution is very good
        if result.score > 0.95:
            weights["add_subtle_bug"] = 0.6
            weights["change_requirements"] = 0.3
            weights["increase_complexity"] = 0.1
        
        # Prefer changing requirements if we've already added several bugs
        elif self.state.evolution_stage >= 2 and "bug_count" in self.state.code_context and self.state.code_context["bug_count"] >= 3:
            weights["add_subtle_bug"] = 0.1
            weights["change_requirements"] = 0.7
            weights["increase_complexity"] = 0.2
            
        # Prefer increasing complexity if the solution is good but not perfect
        elif result.score > 0.85:
            weights["add_subtle_bug"] = 0.2
            weights["change_requirements"] = 0.2
            weights["increase_complexity"] = 0.6
            
        # Default to equal weights
        else:
            weights = {strategy: 1.0 / len(available_strategies) 
                      for strategy in available_strategies}
        
        # Normalize weights for available strategies
        total_weight = sum(weights.get(strategy, 0) for strategy in available_strategies)
        normalized_weights = [weights.get(strategy, 0) / total_weight 
                             for strategy in available_strategies]
        
        # Select a strategy based on weights
        return random.choices(available_strategies, weights=normalized_weights)[0]
    
    def _add_subtle_bug(self, state: ProblemState, solution: str) -> None:
        """
        Add a subtle bug to the solution code.
        
        Args:
            state: The problem state to modify
            solution: The current solution
        """
        # Parse the solution to find potential bug insertion points
        try:
            parsed_solution = ast.parse(solution)
        except SyntaxError:
            # If we can't parse the solution, just add a syntax error
            self._add_syntax_error(state, solution)
            return
        
        # Choose a bug category based on available categories
        available_categories = list(self.bug_categories)
        if available_categories:
            bug_category = random.choice(available_categories)
        else:
            bug_category = BugCategory.LOGICAL
        
        # Add a bug based on the selected category
        if bug_category == BugCategory.SYNTAX:
            self._add_syntax_error(state, solution)
        elif bug_category == BugCategory.LOGICAL:
            self._add_logical_error(state, solution, parsed_solution)
        elif bug_category == BugCategory.PERFORMANCE:
            self._add_performance_issue(state, solution, parsed_solution)
        elif bug_category == BugCategory.EDGE_CASE:
            self._add_edge_case_issue(state, solution, parsed_solution)
        else:
            # Default to logical error
            self._add_logical_error(state, solution, parsed_solution)
        
        # Update bug count in code context
        if "bug_count" not in state.code_context:
            state.code_context["bug_count"] = 0
        state.code_context["bug_count"] += 1
        
        # Add the bug category to the context
        if "bug_categories" not in state.code_context:
            state.code_context["bug_categories"] = []
        state.code_context["bug_categories"].append(bug_category)
    
    def _change_requirements(self, state: ProblemState, solution: str) -> None:
        """
        Change the requirements to challenge the current solution.
        
        Args:
            state: The problem state to modify
            solution: The current solution
        """
        # Get the current requirements
        requirements = state.requirements
        
        # Add a new requirement
        new_requirement = self._generate_new_requirement(state, solution)
        if new_requirement:
            requirements.append(new_requirement)
        
        # Modify an existing requirement if possible
        if requirements and random.random() < 0.5:
            idx = random.randint(0, len(requirements) - 1)
            requirements[idx] = self._modify_requirement(requirements[idx], state, solution)
    
    def _increase_complexity(self, state: ProblemState, solution: str) -> None:
        """
        Increase the complexity of the task.
        
        Args:
            state: The problem state to modify
            solution: The current solution
        """
        # Parse the solution if possible
        try:
            parsed_solution = ast.parse(solution)
        except SyntaxError:
            # If we can't parse the solution, make a simpler change
            self._add_edge_case_requirement(state)
            return
        
        # Choose a complexity increase strategy
        strategies = [
            "add_edge_cases",
            "increase_data_volume",
            "add_performance_constraint",
            "expand_functionality"
        ]
        
        strategy = random.choice(strategies)
        
        if strategy == "add_edge_cases":
            self._add_edge_case_requirement(state)
        elif strategy == "increase_data_volume":
            self._increase_data_volume(state, solution)
        elif strategy == "add_performance_constraint":
            self._add_performance_constraint(state, solution)
        elif strategy == "expand_functionality":
            self._expand_functionality(state, solution)
    
    def _create_test_files(self, temp_path: Path) -> List[Path]:
        """
        Create test files based on the current problem state.
        
        Args:
            temp_path: The temporary directory path
            
        Returns:
            List of test file paths
        """
        test_files = []
        
        # Create test files from the code context
        if "tests" in self.state.code_context:
            for i, test in enumerate(self.state.code_context["tests"]):
                test_file = temp_path / f"test_{i}.py"
                with open(test_file, "w") as f:
                    f.write(test["content"])
                test_files.append(test_file)
        
        # Create a default test file if no tests are specified
        if not test_files:
            test_file = temp_path / "test_default.py"
            with open(test_file, "w") as f:
                f.write(self._generate_default_test())
            test_files.append(test_file)
        
        return test_files
    
    def _calculate_score(self, results: Dict[str, Any]) -> float:
        """
        Calculate a score based on test results.
        
        Args:
            results: The test results
            
        Returns:
            A score between 0 and 1
        """
        # Base score on test results
        if results["total_tests"] == 0:
            test_score = 0.0
        else:
            test_score = results["passed_tests"] / results["total_tests"]
        
        # Adjust for execution success
        execution_score = 1.0 if results["execution"]["success"] else 0.0
        
        # Combine scores with weights
        weights = self.config.get("score_weights", {"test": 0.7, "execution": 0.3})
        score = (test_score * weights["test"] + execution_score * weights["execution"])
        
        # Apply difficulty modifier
        difficulty_modifier = 1.0 + (self.state.difficulty * 0.2)
        score = score / difficulty_modifier
        
        return max(0.0, min(1.0, score))
    
    def _calculate_complexity(self, code: str) -> float:
        """
        Calculate the complexity of code.
        
        Args:
            code: The code to analyze
            
        Returns:
            A complexity score
        """
        # Simple cyclomatic complexity estimation
        complexity = 1
        
        # Count control flow statements
        for pattern in ["if", "for", "while", "and", "or"]:
            complexity += code.count(f" {pattern} ")
        
        # Count function definitions
        complexity += code.count("def ")
        
        # Normalize to 0-1 range
        normalized = min(1.0, complexity / 50.0)
        
        return normalized
    
    def _generate_suggestion_for_test_failure(
        self,
        issue: Dict[str, Any],
        solution: str,
        test_results: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Generate a suggestion for a test failure.
        
        Args:
            issue: The issue data
            solution: The solution code
            test_results: The test results
            
        Returns:
            A suggestion dictionary
        """
        test_name = issue["test"]
        test_result = test_results[test_name]
        
        # Extract relevant parts of the test
        test_content = None
        for test in self.state.code_context.get("tests", []):
            if test.get("name") == test_name:
                test_content = test.get("content")
                break
        
        if test_content:
            # Try to extract the assertion that failed
            assertion_match = re.search(r"assert.*", test_content)
            assertion = assertion_match.group(0) if assertion_match else None
            
            # Look for function names in both test and solution
            test_funcs = re.findall(r"def\s+(\w+)", test_content)
            solution_funcs = re.findall(r"def\s+(\w+)", solution)
            
            # Find functions in test that aren't in solution
            missing_funcs = [f for f in test_funcs if f not in solution_funcs]
            
            if missing_funcs:
                return {
                    "type": "missing_function",
                    "message": f"Implement the missing function(s): {', '.join(missing_funcs)}",
                    "functions": missing_funcs
                }
            elif assertion:
                return {
                    "type": "fix_assertion_failure",
                    "message": f"Fix the code to pass the assertion: {assertion}",
                    "assertion": assertion,
                    "expected": test_result.get("expected"),
                    "actual": test_result.get("actual")
                }
            else:
                return {
                    "type": "fix_test_failure",
                    "message": f"Fix the code to pass the test: {test_name}",
                    "test_name": test_name
                }
        else:
            return {
                "type": "general_fix",
                "message": f"Fix the code to pass the failing test: {test_name}"
            }
    
    def _generate_suggestion_for_error(
        self,
        issue: Dict[str, Any],
        solution: str
    ) -> Dict[str, Any]:
        """
        Generate a suggestion for an error.
        
        Args:
            issue: The issue data
            solution: The solution code
            
        Returns:
            A suggestion dictionary
        """
        error_type = issue["error_type"]
        message = issue["message"]
        location = issue.get("location")
        
        if error_type == "syntax":
            return {
                "type": "fix_syntax",
                "message": f"Fix the syntax error: {message}",
                "location": location
            }
        elif error_type == "runtime":
            return {
                "type": "fix_runtime_error",
                "message": f"Fix the runtime error: {message}",
                "location": location
            }
        else:
            return {
                "type": "fix_error",
                "message": f"Fix the error: {message}",
                "error_type": error_type,
                "location": location
            }
    
    def _determine_focus_areas(
        self,
        issues: List[Dict[str, Any]],
        solution: str,
        result: EvaluationResult
    ) -> List[str]:
        """
        Determine focus areas based on issues and results.
        
        Args:
            issues: The identified issues
            solution: The solution code
            result: The evaluation results
            
        Returns:
            List of focus areas
        """
        focus_areas = []
        
        # Check for syntax issues
        syntax_issues = [i for i in issues if i.get("error_type") == "syntax"]
        if syntax_issues:
            focus_areas.append("syntax")
        
        # Check for failing tests
        test_issues = [i for i in issues if i["type"] == "test_failure"]
        if test_issues:
            if any("expected" in i and "actual" in i for i in test_issues):
                focus_areas.append("logic")
            else:
                focus_areas.append("functionality")
        
        # Check for performance issues
        if result.metrics and "execution_time" in result.metrics:
            if result.metrics["execution_time"] > self.config.get("performance_threshold", 1.0):
                focus_areas.append("performance")
        
        # Check for complexity issues
        if result.metrics and "code_complexity" in result.metrics:
            if result.metrics["code_complexity"] > self.config.get("complexity_threshold", 0.7):
                focus_areas.append("complexity")
        
        # Default focus area if none were identified
        if not focus_areas:
            focus_areas.append("general")
        
        return focus_areas
    
    def _generate_adaptation_hints(
        self,
        solution: str,
        result: EvaluationResult
    ) -> List[Dict[str, Any]]:
        """
        Generate hints about how the problem might adapt in the next iteration.
        
        Args:
            solution: The solution code
            result: The evaluation results
            
        Returns:
            List of adaptation hints
        """
        hints = []
        
        # Hint about potential complexity increases
        if result.score > 0.8:
            hints.append({
                "type": "complexity_increase",
                "message": "The problem may become more complex in the next iteration."
            })
        
        # Hint about potential requirement changes
        if result.score > 0.9 and self.state.evolution_stage >= 1:
            hints.append({
                "type": "requirement_change",
                "message": "The requirements may change in the next iteration."
            })
        
        # Hint about potential bug additions
        if result.score > 0.95:
            hints.append({
                "type": "new_bugs",
                "message": "New, more subtle bugs may be introduced in the next iteration."
            })
        
        # Hint about focus on specific areas
        if result.score > 0.7 and result.score < 0.95:
            focus_areas = result.metrics.get("focus_areas", [])
            if focus_areas:
                hints.append({
                    "type": "focus_shift",
                    "message": f"The next iteration may focus more on: {', '.join(focus_areas)}",
                    "areas": focus_areas
                })
        
        return hints
    
    def _generate_description(self, state: ProblemState) -> str:
        """
        Generate a description for the current problem state.
        
        Args:
            state: The problem state
            
        Returns:
            A descriptive prompt for the problem
        """
        # Base description
        base_desc = (
            f"Fix the bug(s) in the following code. "
            f"This is iteration {state.evolution_stage + 1} of the task."
        )
        
        # Add information about known bug categories
        if "bug_categories" in state.code_context:
            categories = state.code_context["bug_categories"]
            if categories:
                base_desc += f"\n\nThe code contains the following types of issues: {', '.join(categories)}."
        
        # Add requirements
        if state.requirements:
            base_desc += "\n\nRequirements:"
            for i, req in enumerate(state.requirements):
                base_desc += f"\n{i+1}. {req['description']}"
                
        # Add information about difficulty
        difficulty_desc = "easy"
        if state.difficulty > 0.3 and state.difficulty <= 0.6:
            difficulty_desc = "moderate"
        elif state.difficulty > 0.6 and state.difficulty <= 0.8:
            difficulty_desc = "challenging"
        elif state.difficulty > 0.8:
            difficulty_desc = "very challenging"
        
        base_desc += f"\n\nThis is a {difficulty_desc} bug fixing task."
        
        return base_desc
    
    def _generate_focused_description(self, state: ProblemState, issues: List[Dict[str, Any]]) -> str:
        """
        Generate a description focused on remaining issues.
        
        Args:
            state: The problem state
            issues: The identified issues
            
        Returns:
            A descriptive prompt focused on remaining issues
        """
        base_desc = self._generate_description(state)
        
        # Add focus on remaining issues
        if issues:
            base_desc += "\n\nFocus on the following issues:"
            for i, issue in enumerate(issues):
                if issue["type"] == "test_failure":
                    base_desc += f"\n{i+1}. Test failure in '{issue['test']}': {issue['message']}"
                else:
                    base_desc += f"\n{i+1}. {issue['error_type']} error: {issue['message']}"
        
        # Add focus areas if present
        if "focus_areas" in state.code_context:
            areas = state.code_context["focus_areas"]
            if areas:
                base_desc += f"\n\nPay particular attention to: {', '.join(areas)}."
        
        return base_desc
    
    def _generate_guided_description(
        self,
        state: ProblemState,
        issues: List[Dict[str, Any]],
        suggestions: List[Dict[str, Any]]
    ) -> str:
        """
        Generate a description with added guidance.
        
        Args:
            state: The problem state
            issues: The identified issues
            suggestions: The suggested fixes
            
        Returns:
            A descriptive prompt with added guidance
        """
        base_desc = self._generate_description(state)
        
        # Add detailed information about issues
        if issues:
            base_desc += "\n\nThe following issues were identified in your previous solution:"
            for i, issue in enumerate(issues):
                if issue["type"] == "test_failure":
                    base_desc += f"\n{i+1}. Test failure in '{issue['test']}': {issue['message']}"
                    if "expected" in issue and "actual" in issue:
                        base_desc += f"\n   Expected: {issue['expected']}"
                        base_desc += f"\n   Actual: {issue['actual']}"
                else:
                    base_desc += f"\n{i+1}. {issue['error_type']} error: {issue['message']}"
                    if "location" in issue:
                        base_desc += f"\n   Location: {issue['location']}"
        
        # Add suggestions
        if suggestions:
            base_desc += "\n\nConsider the following suggestions:"
            for i, suggestion in enumerate(suggestions):
                base_desc += f"\n{i+1}. {suggestion['message']}"
        
        # Add hints if present
        if "hints" in state.code_context:
            hints = state.code_context["hints"]
            if hints:
                base_desc += "\n\nHints:"
                for i, hint in enumerate(hints):
                    base_desc += f"\n{i+1}. {hint}"
        
        return base_desc
    
    def _generate_hints(
        self,
        solution: str,
        result: EvaluationResult,
        feedback: Feedback
    ) -> List[str]:
        """
        Generate hints based on the solution and feedback.
        
        Args:
            solution: The solution code
            result: The evaluation results
            feedback: The feedback provided
            
        Returns:
            List of hints
        """
        hints = []
        
        # Add hints based on failing tests
        if result.test_results:
            failing_tests = [
                test_name for test_name, test_result in result.test_results.items()
                if not test_result["passed"]
            ]
            
            if failing_tests:
                test_hint = "Focus on fixing the failing tests"
                
                # Add specific information about test expectations if available
                for test_name in failing_tests[:2]:  # Limit to first two tests
                    test_result = result.test_results[test_name]
                    if "expected" in test_result and "actual" in test_result:
                        test_hint += f". For test '{test_name}', expected '{test_result['expected']}' but got '{test_result['actual']}'"
                
                hints.append(test_hint + ".")
        
        # Add hints based on errors
        if result.error_details:
            for error_type, error_info in result.error_details.items():
                hints.append(f"Fix the {error_type} error: {error_info.get('message', 'Unknown error')}.")
        
        # Add hints based on focus areas
        for area in feedback.focus_areas:
            if area == "syntax":
                hints.append("Check your syntax carefully, especially parentheses, indentation, and function definitions.")
            elif area == "logic":
                hints.append("Review the logic of your solution, especially conditional statements and loop conditions.")
            elif area == "functionality":
                hints.append("Ensure your solution implements all required functionality specified in the tests.")
            elif area == "performance":
                hints.append("Consider optimizing your solution for better performance, avoid unnecessary operations.")
            elif area == "complexity":
                hints.append("Try to simplify your solution, it may be more complex than necessary.")
        
        return hints
    
    def _generate_test_hint(self, test_name: str, test_result: Dict[str, Any]) -> str:
        """
        Generate a hint for a specific failing test.
        
        Args:
            test_name: The name of the test
            test_result: The test result
            
        Returns:
            A hint for the test
        """
        if "expected" in test_result and "actual" in test_result:
            return f"The test expected '{test_result['expected']}' but got '{test_result['actual']}'"
        elif "message" in test_result:
            return test_result["message"]
        else:
            return "The test failed, but no detailed information is available."
    
    def _add_syntax_error(self, state: ProblemState, solution: str) -> None:
        """
        Add a syntax error to the solution code.
        
        Args:
            state: The problem state to modify
            solution: The current solution
        """
        lines = solution.split('\n')
        if not lines:
            return
        
        # Choose a line to modify
        idx = random.randint(0, len(lines) - 1)
        line = lines[idx]
        
        # Skip empty lines or comment lines
        while not line.strip() or line.strip().startswith('#'):
            idx = random.randint(0, len(lines) - 1)
            line = lines[idx]
        
        # Choose a modification type
        mod_type = random.choice([
            "remove_character",
            "add_character",
            "swap_characters",
            "change_indent"
        ])
        
        if mod_type == "remove_character" and line:
            char_idx = random.randint(0, len(line) - 1)
            lines[idx] = line[:char_idx] + line[char_idx+1:]
        
        elif mod_type == "add_character":
            char_idx = random.randint(0, len(line))
            char = random.choice(["(", ")", "{", "}", "[", "]", ":", ";", ",", "."])
            lines[idx] = line[:char_idx] + char + line[char_idx:]
        
        elif mod_type == "swap_characters" and len(line) >= 2:
            char_idx = random.randint(0, len(line) - 2)
            lines[idx] = (line[:char_idx] + line[char_idx+1] + 
                         line[char_idx] + line[char_idx+2:])
        
        elif mod_type == "change_indent":
            # Either add or remove indentation
            if line.startswith("    "):
                lines[idx] = line[2:]  # Remove some indent
            else:
                lines[idx] = "  " + line  # Add inconsistent indent
        
        # Update the code
        modified_code = '\n'.join(lines)
        state.code_context["code"] = modified_code
        
        # Add information about the modification
        if "bugs" not in state.code_context:
            state.code_context["bugs"] = []
        
        state.code_context["bugs"].append({
            "type": "syntax",
            "line": idx + 1,
            "description": f"Syntax error introduced in line {idx + 1}"
        })
    
    def _add_logical_error(self, state: ProblemState, solution: str, parsed_solution: ast.Module) -> None:
        """
        Add a logical error to the solution code.
        
        Args:
            state: The problem state to modify
            solution: The current solution
            parsed_solution: The parsed AST of the solution
        """
        modification_types = [
            "change_comparison",
            "invert_condition",
            "off_by_one",
            "change_operator",
            "reverse_logic"
        ]
        
        mod_type = random.choice(modification_types)
        lines = solution.split('\n')
        
        # Find all if statements and loops
        if_statements = []
        for i, line in enumerate(lines):
            if re.search(r'\bif\b|\bwhile\b|\bfor\b', line):
                if_statements.append((i, line))
        
        if if_statements:
            # Choose an if statement to modify
            idx, line = random.choice(if_
# recursive_swe_bench/task_generators/bug_fixing.py (continued)

        if if_statements:
            # Choose an if statement to modify
            idx, line = random.choice(if_statements)
            
            if mod_type == "change_comparison":
                # Change comparison operators
                comparisons = {"==": "!=", "!=": "==", ">": "<", "<": ">", ">=": "<=", "<=": ">="}
                for op, new_op in comparisons.items():
                    if op in line:
                        lines[idx] = line.replace(op, new_op, 1)
                        break
            
            elif mod_type == "invert_condition":
                # Add or remove a "not" to invert the condition
                if "not" in line:
                    lines[idx] = line.replace("not ", "", 1)
                else:
                    match = re.search(r'(if|while)\s+([^:]+):', line)
                    if match:
                        condition = match.group(2)
                        lines[idx] = line.replace(condition, f"not ({condition})", 1)
            
            elif mod_type == "off_by_one":
                # Introduce an off-by-one error
                for op in ["+", "-"]:
                    if op in line:
                        # If there's a number after the operator, change it
                        match = re.search(f'\\{op}\\s*(\\d+)', line)
                        if match:
                            num = int(match.group(1))
                            new_num = num + 1 if op == "+" else max(0, num - 1)
                            lines[idx] = line.replace(f"{op} {num}", f"{op} {new_num}", 1)
                            break
            
            elif mod_type == "change_operator":
                # Change arithmetic or logical operators
                operators = {"+": "-", "-": "+", "*": "/", "/": "*", "and": "or", "or": "and"}
                for op, new_op in operators.items():
                    if f" {op} " in line:
                        lines[idx] = line.replace(f" {op} ", f" {new_op} ", 1)
                        break
            
            elif mod_type == "reverse_logic":
                # Reverse the logic of a compound condition
                if " and " in line:
                    parts = line.split(" and ")
                    lines[idx] = line.replace(" and ".join(parts), " or ".join(parts), 1)
                elif " or " in line:
                    parts = line.split(" or ")
                    lines[idx] = line.replace(" or ".join(parts), " and ".join(parts), 1)
        
        else:
            # If no if statements found, introduce a different kind of logical error
            # Find variable assignments
            assignments = []
            for i, line in enumerate(lines):
                if "=" in line and "==" not in line and "!=" not in line:
                    assignments.append((i, line))
            
            if assignments:
                # Choose an assignment to modify
                idx, line = random.choice(assignments)
                
                # Modify the assignment
                if "+" in line:
                    lines[idx] = line.replace("+", "-", 1)
                elif "-" in line:
                    lines[idx] = line.replace("-", "+", 1)
                elif "*" in line:
                    lines[idx] = line.replace("*", "/", 1)
                elif "/" in line:
                    lines[idx] = line.replace("/", "*", 1)
                else:
                    # If no arithmetic operator, change the value
                    match = re.search(r'=\s*(\d+)', line)
                    if match:
                        num = int(match.group(1))
                        new_num = num + random.choice([-1, 1]) * random.randint(1, 3)
                        lines[idx] = line.replace(f"= {num}", f"= {new_num}", 1)
        
        # Update the code
        modified_code = '\n'.join(lines)
        state.code_context["code"] = modified_code
        
        # Add information about the modification
        if "bugs" not in state.code_context:
            state.code_context["bugs"] = []
        
        state.code_context["bugs"].append({
            "type": "logical",
            "line": idx + 1,
            "description": f"Logical error introduced in line {idx + 1}: {mod_type}"
        })
    
    def _add_performance_issue(self, state: ProblemState, solution: str, parsed_solution: ast.Module) -> None:
        """
        Add a performance issue to the solution code.
        
        Args:
            state: The problem state to modify
            solution: The current solution
            parsed_solution: The parsed AST of the solution
        """
        lines = solution.split('\n')
        
        # Find loops in the code
        loops = []
        for i, line in enumerate(lines):
            if re.search(r'\bfor\b|\bwhile\b', line):
                loops.append((i, line))
        
        if loops:
            # Choose a loop to modify
            idx, line = random.choice(loops)
            
            # Choose a modification type
            mod_type = random.choice([
                "add_nested_loop",
                "replace_efficient_operation",
                "add_redundant_computation"
            ])
            
            if mod_type == "add_nested_loop":
                # Add a nested loop
                indent = len(line) - len(line.lstrip())
                indent_str = ' ' * indent
                loop_body_indent = indent_str + '    '
                
                # Find the next line with the same indentation or less
                end_idx = idx + 1
                while end_idx < len(lines) and (not lines[end_idx].strip() or len(lines[end_idx]) - len(lines[end_idx].lstrip()) > indent):
                    end_idx += 1
                
                # Insert a nested loop before the end of the current loop
                insert_pos = end_idx
                lines.insert(insert_pos, f"{loop_body_indent}for _ in range(100):  # Unnecessary loop")
                lines.insert(insert_pos + 1, f"{loop_body_indent}    pass")
            
            elif mod_type == "replace_efficient_operation":
                # Replace an efficient operation with a less efficient one
                # Look for list comprehensions or efficient operations
                for i in range(idx + 1, min(idx + 10, len(lines))):
                    if "append" in lines[i] or "extend" in lines[i]:
                        indent = len(lines[i]) - len(lines[i].lstrip())
                        indent_str = ' ' * indent
                        match = re.search(r'(\w+)\.(append|extend)', lines[i])
                        if match:
                            list_name = match.group(1)
                            operation = match.group(2)
                            item = lines[i].split(f"{list_name}.{operation}(")[1].split(")")[0]
                            
                            if operation == "append":
                                # Replace append with concatenation
                                lines[i] = f"{indent_str}{list_name} = {list_name} + [{item}]  # Less efficient than append"
                            elif operation == "extend":
                                # Replace extend with concatenation
                                lines[i] = f"{indent_str}{list_name} = {list_name} + {item}  # Less efficient than extend"
                            break
            
            elif mod_type == "add_redundant_computation":
                # Add redundant computation inside the loop
                # Find the indentation level of the loop body
                if idx + 1 < len(lines):
                    body_indent = len(lines[idx + 1]) - len(lines[idx + 1].lstrip())
                    body_indent_str = ' ' * body_indent
                    
                    # Add redundant computation
                    lines.insert(idx + 1, f"{body_indent_str}temp = []  # Redundant computation")
                    lines.insert(idx + 2, f"{body_indent_str}for i in range(1000):")
                    lines.insert(idx + 3, f"{body_indent_str}    temp.append(i)")
                    lines.insert(idx + 4, f"{body_indent_str}    temp.sort()  # Unnecessary sort in each iteration")
        
        else:
            # If no loops found, introduce inefficient data structure or algorithm
            function_defs = []
            for i, line in enumerate(lines):
                if line.strip().startswith("def "):
                    function_defs.append((i, line))
            
            if function_defs:
                # Choose a function to modify
                idx, line = random.choice(function_defs)
                
                # Find the indentation level of the function body
                if idx + 1 < len(lines):
                    body_indent = len(lines[idx + 1]) - len(lines[idx + 1].lstrip())
                    body_indent_str = ' ' * body_indent
                    
                    # Add inefficient code at the beginning of the function
                    lines.insert(idx + 1, f"{body_indent_str}# Inefficient data structure usage")
                    lines.insert(idx + 2, f"{body_indent_str}data = []")
                    lines.insert(idx + 3, f"{body_indent_str}for i in range(1000):")
                    lines.insert(idx + 4, f"{body_indent_str}    data.append(i)")
                    lines.insert(idx + 5, f"{body_indent_str}    # Inefficient search operation")
                    lines.insert(idx + 6, f"{body_indent_str}    if i in data:  # Linear search instead of using a set")
                    lines.insert(idx + 7, f"{body_indent_str}        pass")
        
        # Update the code
        modified_code = '\n'.join(lines)
        state.code_context["code"] = modified_code
        
        # Add information about the modification
        if "bugs" not in state.code_context:
            state.code_context["bugs"] = []
        
        state.code_context["bugs"].append({
            "type": "performance",
            "line": idx + 1,
            "description": f"Performance issue introduced around line {idx + 1}"
        })
    
    def _add_edge_case_issue(self, state: ProblemState, solution: str, parsed_solution: ast.Module) -> None:
        """
        Add an edge case issue to the solution code.
        
        Args:
            state: The problem state to modify
            solution: The current solution
            parsed_solution: The parsed AST of the solution
        """
        lines = solution.split('\n')
        
        # Find functions in the code
        functions = []
        current_func = None
        func_start = None
        for i, line in enumerate(lines):
            if line.strip().startswith("def "):
                if current_func:
                    functions.append((func_start, i - 1, current_func))
                current_func = line.strip()[4:].split("(")[0]
                func_start = i
            elif i == len(lines) - 1 and current_func:
                functions.append((func_start, i, current_func))
        
        if functions:
            # Choose a function to modify
            start_idx, end_idx, func_name = random.choice(functions)
            
            # Choose a modification type
            mod_type = random.choice([
                "remove_boundary_check",
                "introduce_zero_division",
                "handling_empty_input",
                "type_assumption"
            ])
            
            if mod_type == "remove_boundary_check":
                # Find and remove or modify boundary checks
                for i in range(start_idx, end_idx + 1):
                    if re.search(r'if\s+.*(?:len|count|size|length|empty|<=|>=|<|>|\!=)', lines[i]):
                        # Comment out the boundary check
                        lines[i] = f"# {lines[i]}  # Boundary check removed"
                        # Skip the body of the if statement
                        j = i + 1
                        indent = len(lines[i]) - len(lines[i].lstrip())
                        body_indent = indent + 4
                        while j <= end_idx and (not lines[j].strip() or len(lines[j]) - len(lines[j].lstrip()) >= body_indent):
                            lines[j] = f"# {lines[j]}"
                            j += 1
                        break
            
            elif mod_type == "introduce_zero_division":
                # Find division operations and modify them
                for i in range(start_idx, end_idx + 1):
                    if "/" in lines[i] and "try" not in lines[i] and "except" not in lines[i]:
                        # Remove denominator check if it exists
                        if re.search(r'if\s+.*(?:!=\s*0|>\s*0)', lines[i]):
                            lines[i] = f"# {lines[i]}  # Denominator check removed"
                        else:
                            # Or modify a division to potentially cause zero division
                            match = re.search(r'(\w+)\s*/\s*(\w+)', lines[i])
                            if match:
                                denominator = match.group(2)
                                # Add a potential zero value for the denominator
                                indent = len(lines[i]) - len(lines[i].lstrip())
                                indent_str = ' ' * indent
                                lines.insert(i, f"{indent_str}if random.random() < 0.1:  # Introduce potential zero division")
                                lines.insert(i + 1, f"{indent_str}    {denominator} = 0")
                                break
            
            elif mod_type == "handling_empty_input":
                # Modify parameter handling to not handle empty inputs correctly
                params = re.search(r'def\s+\w+\s*\((.*?)\)', lines[start_idx])
                if params and params.group(1):
                    param_list = [p.strip() for p in params.group(1).split(",")]
                    if param_list:
                        param = param_list[0].split("=")[0].strip()
                        # Find checks for the parameter
                        for i in range(start_idx + 1, end_idx + 1):
                            if re.search(rf'if\s+.*(?:not\s+{param}|len\s*\(\s*{param}\s*\)\s*==\s*0)', lines[i]):
                                # Comment out the empty check
                                lines[i] = f"# {lines[i]}  # Empty input check removed"
                                # Skip the body of the if statement
                                j = i + 1
                                indent = len(lines[i]) - len(lines[i].lstrip())
                                body_indent = indent + 4
                                while j <= end_idx and (not lines[j].strip() or len(lines[j]) - len(lines[j].lstrip()) >= body_indent):
                                    lines[j] = f"# {lines[j]}"
                                    j += 1
                                break
            
            elif mod_type == "type_assumption":
                # Introduce assumptions about parameter types
                params = re.search(r'def\s+\w+\s*\((.*?)\)', lines[start_idx])
                if params and params.group(1):
                    param_list = [p.strip() for p in params.group(1).split(",")]
                    if param_list:
                        param = param_list[0].split("=")[0].strip()
                        # Find type checks for the parameter
                        type_check_found = False
                        for i in range(start_idx + 1, end_idx + 1):
                            if re.search(rf'(?:isinstance|type)\s*\(\s*{param}\s*,', lines[i]):
                                # Comment out the type check
                                lines[i] = f"# {lines[i]}  # Type check removed"
                                type_check_found = True
                                break
                        
                        if not type_check_found:
                            # Add a problematic type assumption
                            indent = 4  # Assume basic indentation
                            for i in range(start_idx + 1, min(start_idx + 5, end_idx + 1)):
                                if lines[i].strip():
                                    indent = len(lines[i]) - len(lines[i].lstrip())
                                    break
                            
                            indent_str = ' ' * indent
                            # Add code that assumes a specific type
                            lines.insert(start_idx + 1, f"{indent_str}# Assuming {param} is a specific type without checking")
                            lines.insert(start_idx + 2, f"{indent_str}{param}_length = len({param})  # Will fail if {param} doesn't support len()")
        
        # Update the code
        modified_code = '\n'.join(lines)
        state.code_context["code"] = modified_code
        
        # Add information about the modification
        if "bugs" not in state.code_context:
            state.code_context["bugs"] = []
        
        state.code_context["bugs"].append({
            "type": "edge_case",
            "line": start_idx + 1,
            "description": f"Edge case issue introduced in function '{func_name}': {mod_type}"
        })
    
    def _generate_new_requirement(self, state: ProblemState, solution: str) -> Dict[str, Any]:
        """
        Generate a new requirement based on the current state and solution.
        
        Args:
            state: The current problem state
            solution: The current solution
            
        Returns:
            A new requirement dictionary
        """
        # Parse the solution to find functions and variables
        function_names = re.findall(r'def\s+(\w+)', solution)
        variable_names = re.findall(r'(\w+)\s*=', solution)
        
        # Choose a requirement type
        req_type = random.choice([
            "edge_case_handling",
            "performance_improvement",
            "error_handling",
            "type_checking",
            "feature_addition"
        ])
        
        if req_type == "edge_case_handling":
            if function_names:
                func_name = random.choice(function_names)
                edge_cases = [
                    "empty input",
                    "negative values",
                    "zero values",
                    "extremely large values",
                    "special characters",
                    "duplicate values"
                ]
                edge_case = random.choice(edge_cases)
                return {
                    "type": "edge_case_handling",
                    "description": f"The function '{func_name}' should handle {edge_case} correctly.",
                    "difficulty": random.uniform(0.3, 0.7)
                }
            
        elif req_type == "performance_improvement":
            return {
                "type": "performance_improvement",
                "description": "The solution should be optimized to run in O(n) time or better.",
                "difficulty": random.uniform(0.4, 0.8)
            }
            
        elif req_type == "error_handling":
            error_types = [
                "invalid input",
                "division by zero",
                "file not found",
                "network timeout",
                "permission denied"
            ]
            error_type = random.choice(error_types)
            return {
                "type": "error_handling",
                "description": f"The code should handle {error_type} errors gracefully.",
                "difficulty": random.uniform(0.2, 0.6)
            }
            
        elif req_type == "type_checking":
            if function_names:
                func_name = random.choice(function_names)
                return {
                    "type": "type_checking",
                    "description": f"The function '{func_name}' should validate input types before processing.",
                    "difficulty": random.uniform(0.1, 0.5)
                }
            
        elif req_type == "feature_addition":
            features = [
                "logging capability",
                "progress tracking",
                "caching for repeated operations",
                "parameter validation",
                "configuration options"
            ]
            feature = random.choice(features)
            return {
                "type": "feature_addition",
                "description": f"Add {feature} to the solution.",
                "difficulty": random.uniform(0.3, 0.7)
            }
        
        # Default requirement if none of the above were applicable
        return {
            "type": "general_improvement",
            "description": "Improve the overall code quality and readability.",
            "difficulty": random.uniform(0.1, 0.4)
        }
    
    def _modify_requirement(self, requirement: Dict[str, Any], state: ProblemState, solution: str) -> Dict[str, Any]:
        """
        Modify an existing requirement to make it more challenging.
        
        Args:
            requirement: The requirement to modify
            state: The current problem state
            solution: The current solution
            
        Returns:
            The modified requirement
        """
        # Make a copy of the requirement
        modified_req = copy.deepcopy(requirement)
        
        # Increase the difficulty
        modified_req["difficulty"] = min(1.0, requirement.get("difficulty", 0.3) + random.uniform(0.1, 0.3))
        
        # Modify the description based on the requirement type
        if requirement["type"] == "edge_case_handling":
            modified_req["description"] += " Additionally, it should handle very large inputs efficiently."
        
        elif requirement["type"] == "performance_improvement":
            modified_req["description"] = modified_req["description"].replace("O(n)", "O(log n)")
        
        elif requirement["type"] == "error_handling":
            modified_req["description"] += " And provide detailed error messages for debugging."
        
        elif requirement["type"] == "type_checking":
            modified_req["description"] += " And automatically convert types when possible."
        
        elif requirement["type"] == "feature_addition":
            modified_req["description"] += " Ensure this feature is configurable via parameters."
        
        else:
            modified_req["description"] += " The code should also be well-documented with comments."
        
        return modified_req
    
    def _add_edge_case_requirement(self, state: ProblemState) -> None:
        """
        Add a requirement for handling edge cases.
        
        Args:
            state: The problem state to modify
        """
        edge_cases = [
            "empty collections",
            "null/None values",
            "boundary values (min/max)",
            "negative numbers",
            "special characters",
            "Unicode characters",
            "very large inputs",
            "malformed input"
        ]
        
        edge_case = random.choice(edge_cases)
        
        # Add a new requirement
        state.requirements.append({
            "type": "edge_case_handling",
            "description": f"The solution must correctly handle {edge_case}.",
            "difficulty": random.uniform(0.3, 0.7)
        })
        
        # Add test cases for the edge case if tests exist
        if "tests" in state.code_context:
            # Create a new test for the edge case
            test_template = self._generate_edge_case_test(edge_case, state.code_context)
            if test_template:
                state.code_context["tests"].append({
                    "name": f"test_edge_case_{len(state.code_context['tests'])}",
                    "content": test_template,
                    "description": f"Test handling of {edge_case}"
                })
    
    def _increase_data_volume(self, state: ProblemState, solution: str) -> None:
        """
        Modify the problem to require handling larger data volumes.
        
        Args:
            state: The problem state to modify
            solution: The current solution
        """
        # Add a requirement for handling large data
        state.requirements.append({
            "type": "scalability",
            "description": "The solution must efficiently handle large datasets (10,000+ items).",
            "difficulty": random.uniform(0.5, 0.8)
        })
        
        # Modify existing tests to use larger data if tests exist
        if "tests" in state.code_context:
            for i, test in enumerate(state.code_context["tests"]):
                content = test["content"]
                
                # Look for small lists or arrays in tests
                for pattern, replacement in [
                    (r'\[[^\]]{0,50}\]', '[random.randint(0, 1000) for _ in range(10000)]'),
                    (r'range\(\d+\)', 'range(10000)'),
                    (r'"[^"]{0,20}"', '"' + 'a' * 10000 + '"')
                ]:
                    match = re.search(pattern, content)
                    if match and random.random() < 0.3:  # Only replace some instances
                        content = content.replace(match.group(0), replacement, 1)
                        break
                
                state.code_context["tests"][i]["content"] = content
                state.code_context["tests"][i]["description"] = f"{test.get('description', 'Test')} (with large data)"
    
    def _add_performance_constraint(self, state: ProblemState, solution: str) -> None:
        """
        Add a performance constraint to the problem.
        
        Args:
            state: The problem state to modify
            solution: The current solution
        """
        # Choose a performance constraint
        constraints = [
            "linear time complexity (O(n))",
            "logarithmic time complexity (O(log n))",
            "constant memory usage (O(1) space)",
            "execution time under 100ms for large inputs",
            "minimal function calls"
        ]
        
        constraint = random.choice(constraints)
        
        # Add a new requirement
        state.requirements.append({
            "type": "performance",
            "description": f"The solution must achieve {constraint}.",
            "difficulty": random.uniform(0.6, 0.9)
        })
        
        # Add performance testing code if tests exist
        if "tests" in state.code_context:
            # Add a performance test
            perf_test = self._generate_performance_test(constraint, state.code_context)
            if perf_test:
                state.code_context["tests"].append({
                    "name": f"test_performance_{len(state.code_context['tests'])}",
                    "content": perf_test,
                    "description": f"Test {constraint}"
                })
    
    def _expand_functionality(self, state: ProblemState, solution: str) -> None:
        """
        Expand the required functionality of the solution.
        
        Args:
            state: The problem state to modify
            solution: The current solution
        """
        # Choose a functionality expansion
        expansions = [
            "support for different input types",
            "parameterized behavior",
            "additional output formats",
            "flexible error handling",
            "integration with external systems"
        ]
        
        expansion = random.choice(expansions)
        
        # Add a new requirement
        state.requirements.append({
            "type": "functionality",
            "description": f"Expand the solution to include {expansion}.",
            "difficulty": random.uniform(0.4, 0.8)
        })
        
        # Add test cases for the new functionality if tests exist
        if "tests" in state.code_context:
            # Create a new test for the expanded functionality
            test_template = self._generate_functionality_test(expansion, state.code_context)
            if test_template:
                state.code_context["tests"].append({
                    "name": f"test_expanded_functionality_{len(state.code_context['tests'])}",
                    "content": test_template,
                    "description": f"Test {expansion}"
                })
    
    def _generate_default_test(self) -> str:
        """
        Generate a default test based on the current problem state.
        
        Returns:
            A default test script
        """
        # Generate a basic test script
        return """
import unittest
import sys
import os

# Add the directory containing the solution to the path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Import the solution
from solution import *

class DefaultTest(unittest.TestCase):
    def test_basic_functionality(self):
        # A basic test that should pass if the solution is correct
        self.assertTrue(True, "Basic assertion failed")
        
    def test_expected_output(self):
        # Test expected output of main functions
        # This will need to be updated based on the specific problem
        pass
        
if __name__ == '__main__':
    unittest.main()
"""
    
    def _generate_edge_case_test(self, edge_case: str, code_context: Dict[str, Any]) -> str:
        """
        Generate a test for an edge case.
        
        Args:
            edge_case: The edge case to test
            code_context: The code context containing information about the problem
            
        Returns:
            A test script for the edge case
        """
        # Extract function names from the code context
        function_names = []
        if "code" in code_context:
            function_names = re.findall(r'def\s+(\w+)', code_context["code"])
        
        if not function_names:
            return None
        
        # Choose a function to test
        function_name = random.choice(function_names)
        
        # Generate test code based on the edge case
        if edge_case == "empty collections":
            return f"""
import unittest
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import {function_name}

class EmptyCollectionTest(unittest.TestCase):
    def test_empty_input(self):
        # Test with empty list
        result = {function_name}([])
        self.assertIsNotNone(result, "Function should handle empty list")
        
        # Test with empty string
        result = {function_name}("")
        self.assertIsNotNone(result, "Function should handle empty string")
        
        # Test with empty dict
        result = {function_name}({{}})
        self.assertIsNotNone(result, "Function should handle empty dict")
        
if __name__ == '__main__':
    unittest.main()
"""
        elif edge_case == "null/None values":
            return f"""
import unittest
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import {function_name}

class NoneValueTest(unittest.TestCase):
    def test_none_input(self):
        # Test with None as input
        result = {function_name}(None)
        self.assertIsNotNone(result, "Function should handle None input")
        
        # Test with list containing None
        result = {function_name}([1, None, 3])
        self.assertIsNotNone(result, "Function should handle list with None values")
        
if __name__ == '__main__':
    unittest.main()
"""
        elif edge_case == "boundary values (min/max)":
            return f"""
# recursive_swe_bench/task_generators/bug_fixing.py (completion)

import unittest
import sys
import os
import sys

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import {function_name}

class BoundaryValueTest(unittest.TestCase):
    def test_min_max_values(self):
        # Test with minimum integer
        min_int = -sys.maxsize - 1
        result = {function_name}(min_int)
        self.assertIsNotNone(result, "Function should handle minimum integer")
        
        # Test with maximum integer
        max_int = sys.maxsize
        result = {function_name}(max_int)
        self.assertIsNotNone(result, "Function should handle maximum integer")
        
        # Test with very large list
        large_list = list(range(10000))
        result = {function_name}(large_list)
        self.assertIsNotNone(result, "Function should handle very large inputs")
        
if __name__ == '__main__':
    unittest.main()
"""
        elif edge_case == "negative numbers":
            return f"""
import unittest
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import {function_name}

class NegativeNumberTest(unittest.TestCase):
    def test_negative_numbers(self):
        # Test with negative number
        result = {function_name}(-1)
        self.assertIsNotNone(result, "Function should handle negative numbers")
        
        # Test with list of negative numbers
        result = {function_name}([-1, -2, -3])
        self.assertIsNotNone(result, "Function should handle lists of negative numbers")
        
        # Test with mixed positive and negative
        result = {function_name}([-1, 0, 1])
        self.assertIsNotNone(result, "Function should handle mixed positive and negative")
        
if __name__ == '__main__':
    unittest.main()
"""
        else:
            # Generic edge case test
            return f"""
import unittest
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import {function_name}

class EdgeCaseTest(unittest.TestCase):
    def test_edge_case_{edge_case.replace(' ', '_')}(self):
        # Test edge case: {edge_case}
        # This is a placeholder test that needs to be customized for the specific edge case
        self.assertTrue(True, "Edge case test not implemented")
        
if __name__ == '__main__':
    unittest.main()
"""
    
    def _generate_performance_test(self, constraint: str, code_context: Dict[str, Any]) -> str:
        """
        Generate a performance test based on a constraint.
        
        Args:
            constraint: The performance constraint
            code_context: The code context containing information about the problem
            
        Returns:
            A test script for the performance constraint
        """
        # Extract function names from the code context
        function_names = []
        if "code" in code_context:
            function_names = re.findall(r'def\s+(\w+)', code_context["code"])
        
        if not function_names:
            return None
        
        # Choose a function to test
        function_name = random.choice(function_names)
        
        if "time complexity" in constraint:
            return f"""
import unittest
import sys
import os
import time
import random

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import {function_name}

class PerformanceTest(unittest.TestCase):
    def test_time_complexity(self):
        # Test for {constraint}
        sizes = [100, 1000, 10000]
        times = []
        
        for size in sizes:
            # Generate input of the given size
            input_data = [random.randint(0, 1000) for _ in range(size)]
            
            # Measure execution time
            start_time = time.time()
            {function_name}(input_data)
            end_time = time.time()
            
            times.append(end_time - start_time)
        
        # Check if time grows appropriately
        # For O(n), time should grow linearly with input size
        # For O(log n), time should grow logarithmically
        # This is a simplified check and might need adjustment
        if "log n" in "{constraint}":
            # For logarithmic time, the ratio of times should decrease
            ratio1 = times[1] / times[0]
            ratio2 = times[2] / times[1]
            self.assertLess(ratio2, ratio1 * 1.5, 
                           f"Growth rate appears super-logarithmic: {times}")
        else:  # Assume linear or better
            # For linear time, the ratio of times should be roughly equal to ratio of sizes
            ratio1 = times[1] / times[0]
            size_ratio1 = sizes[1] / sizes[0]
            
            ratio2 = times[2] / times[1]
            size_ratio2 = sizes[2] / sizes[1]
            
            self.assertLess(ratio1, size_ratio1 * 1.5, 
                           f"First growth rate appears super-linear: {times}")
            self.assertLess(ratio2, size_ratio2 * 1.5, 
                           f"Second growth rate appears super-linear: {times}")
        
if __name__ == '__main__':
    unittest.main()
"""
        elif "execution time" in constraint:
            return f"""
import unittest
import sys
import os
import time
import random

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import {function_name}

class PerformanceTest(unittest.TestCase):
    def test_execution_time(self):
        # Test for {constraint}
        # Generate a large input
        input_data = [random.randint(0, 1000) for _ in range(10000)]
        
        # Measure execution time
        start_time = time.time()
        {function_name}(input_data)
        end_time = time.time()
        
        execution_time = (end_time - start_time) * 1000  # Convert to ms
        
        self.assertLess(execution_time, 100, 
                       f"Execution time exceeded 100ms: {execution_time:.2f}ms")
        
if __name__ == '__main__':
    unittest.main()
"""
        elif "memory usage" in constraint:
            return f"""
import unittest
import sys
import os
import psutil
import random

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import {function_name}

class MemoryUsageTest(unittest.TestCase):
    def test_memory_usage(self):
        # Test for {constraint}
        # Note: This is an approximate test and may not be accurate in all environments
        
        # Get current process
        process = psutil.Process(os.getpid())
        
        # Measure memory before
        memory_before = process.memory_info().rss / 1024 / 1024  # MB
        
        # Generate a large input
        input_data = [random.randint(0, 1000) for _ in range(100000)]
        
        # Run function
        {function_name}(input_data)
        
        # Measure memory after
        memory_after = process.memory_info().rss / 1024 / 1024  # MB
        
        # Calculate memory usage
        memory_used = memory_after - memory_before
        
        # A crude approximation, adjust as needed
        self.assertLess(memory_used, 10, 
                       f"Memory usage seems high: {memory_used:.2f}MB")
        
if __name__ == '__main__':
    unittest.main()
"""
        else:
            # Generic performance test
            return f"""
import unittest
import sys
import os
import time
import random

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import {function_name}

class PerformanceTest(unittest.TestCase):
    def test_performance(self):
        # Test for {constraint}
        # This is a placeholder test that needs to be customized for the specific constraint
        
        # Generate a large input
        input_data = [random.randint(0, 1000) for _ in range(10000)]
        
        # Measure execution time
        start_time = time.time()
        {function_name}(input_data)
        end_time = time.time()
        
        execution_time = end_time - start_time
        
        # Just log the time for now
        print(f"Execution time: {execution_time:.4f} seconds")
        self.assertTrue(True, "Performance test completed")
        
if __name__ == '__main__':
    unittest.main()
"""
    
    def _generate_functionality_test(self, expansion: str, code_context: Dict[str, Any]) -> str:
        """
        Generate a test for expanded functionality.
        
        Args:
            expansion: The functionality expansion
            code_context: The code context containing information about the problem
            
        Returns:
            A test script for the expanded functionality
        """
        # Extract function names from the code context
        function_names = []
        if "code" in code_context:
            function_names = re.findall(r'def\s+(\w+)', code_context["code"])
        
        if not function_names:
            return None
        
        # Choose a function to test
        function_name = random.choice(function_names)
        
        if "different input types" in expansion:
            return f"""
import unittest
import sys
import os
import json
from collections import namedtuple

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import {function_name}

class InputTypesTest(unittest.TestCase):
    def test_different_input_types(self):
        # Test with different types of inputs
        
        # Test with list
        list_input = [1, 2, 3]
        list_result = {function_name}(list_input)
        self.assertIsNotNone(list_result, "Function should handle list input")
        
        # Test with tuple
        tuple_input = (1, 2, 3)
        tuple_result = {function_name}(tuple_input)
        self.assertIsNotNone(tuple_result, "Function should handle tuple input")
        
        # Test with set
        set_input = {{1, 2, 3}}
        set_result = {function_name}(set_input)
        self.assertIsNotNone(set_result, "Function should handle set input")
        
        # Test with dictionary
        dict_input = {{"a": 1, "b": 2, "c": 3}}
        dict_result = {function_name}(dict_input)
        self.assertIsNotNone(dict_result, "Function should handle dictionary input")
        
        # Test with JSON string
        json_input = '{{"data": [1, 2, 3]}}'
        json_result = {function_name}(json_input)
        self.assertIsNotNone(json_result, "Function should handle JSON string")
        
        # Test with custom object
        Point = namedtuple('Point', ['x', 'y'])
        obj_input = Point(1, 2)
        obj_result = {function_name}(obj_input)
        self.assertIsNotNone(obj_result, "Function should handle custom object")
        
if __name__ == '__main__':
    unittest.main()
"""
        elif "parameterized behavior" in expansion:
            return f"""
import unittest
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import {function_name}

class ParameterizedTest(unittest.TestCase):
    def test_parameterized_behavior(self):
        # Test function with different parameters
        
        # Base case with default parameters
        base_input = [1, 2, 3]
        base_result = {function_name}(base_input)
        
        # The function should now accept additional parameters
        # These are example parameters, adjust based on the specific function
        
        # With sorting parameter
        try:
            sorted_result = {function_name}(base_input, sort=True)
            self.assertIsNotNone(sorted_result, "Function should handle sort parameter")
        except TypeError as e:
            self.fail(f"Function does not support sort parameter: {{e}}")
        
        # With filtering parameter
        try:
            filtered_result = {function_name}(base_input, filter_fn=lambda x: x > 1)
            self.assertIsNotNone(filtered_result, "Function should handle filter_fn parameter")
        except TypeError as e:
            self.fail(f"Function does not support filter_fn parameter: {{e}}")
        
        # With formatting parameter
        try:
            formatted_result = {function_name}(base_input, format="json")
            self.assertIsNotNone(formatted_result, "Function should handle format parameter")
        except TypeError as e:
            self.fail(f"Function does not support format parameter: {{e}}")
        
if __name__ == '__main__':
    unittest.main()
"""
        elif "additional output formats" in expansion:
            return f"""
import unittest
import sys
import os
import json

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import {function_name}

class OutputFormatsTest(unittest.TestCase):
    def test_output_formats(self):
        # Test function with different output formats
        input_data = [1, 2, 3]
        
        # Original format
        original_result = {function_name}(input_data)
        
        # The function should now support different output formats
        # These are example formats, adjust based on the specific function
        
        # JSON format
        try:
            json_result = {function_name}(input_data, format="json")
            # Check if it's valid JSON
            try:
                json_obj = json.loads(json_result) if isinstance(json_result, str) else json_result
                self.assertIsNotNone(json_obj, "JSON result should be valid")
            except json.JSONDecodeError:
                self.fail("JSON result is not valid")
        except TypeError as e:
            self.fail(f"Function does not support JSON format: {{e}}")
        
        # CSV format
        try:
            csv_result = {function_name}(input_data, format="csv")
            self.assertIsNotNone(csv_result, "CSV result should not be None")
            if isinstance(csv_result, str):
                self.assertIn(",", csv_result, "CSV result should contain commas")
        except TypeError as e:
            self.fail(f"Function does not support CSV format: {{e}}")
        
        # XML format
        try:
            xml_result = {function_name}(input_data, format="xml")
            self.assertIsNotNone(xml_result, "XML result should not be None")
            if isinstance(xml_result, str):
                self.assertIn("<", xml_result, "XML result should contain tags")
                self.assertIn(">", xml_result, "XML result should contain tags")
        except TypeError as e:
            self.fail(f"Function does not support XML format: {{e}}")
        
if __name__ == '__main__':
    unittest.main()
"""
        else:
            # Generic functionality expansion test
            return f"""
import unittest
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import {function_name}

class ExpandedFunctionalityTest(unittest.TestCase):
    def test_expanded_functionality(self):
        # Test for {expansion}
        # This is a placeholder test that needs to be customized for the specific expansion
        
        # Basic test to verify the function exists
        input_data = [1, 2, 3]
        result = {function_name}(input_data)
        self.assertIsNotNone(result, "Function should return a result")
        
        # You need to add specific tests for the expanded functionality
        
if __name__ == '__main__':
    unittest.main()
"""
    
    def _calculate_adaptation_vector(self, solution: str, result: EvaluationResult, feedback: Feedback) -> List[float]:
        """
        Calculate an adaptation vector based on the solution, result, and feedback.
        
        The adaptation vector encodes how the problem should evolve in future iterations,
        capturing dimensions like difficulty, bug type emphasis, and feedback focus.
        
        Args:
            solution: The current solution
            result: The evaluation results
            feedback: The feedback provided
            
        Returns:
            An adaptation vector (list of floats)
        """
        # Initialize adaptation vector with zeros
        # Dimensions:
        # [0] - difficulty adjustment
        # [1] - syntax vs logical bug emphasis
        # [2] - performance focus
        # [3] - edge case focus
        # [4] - requirement expansion
        adaptation_vector = [0.0] * 5
        
        # Adjust difficulty based on score
        if result.score > 0.95:
            adaptation_vector[0] = 0.2  # Increase difficulty significantly
        elif result.score > 0.8:
            adaptation_vector[0] = 0.1  # Increase difficulty moderately
        elif result.score > 0.6:
            adaptation_vector[0] = 0.0  # Maintain current difficulty
        elif result.score > 0.4:
            adaptation_vector[0] = -0.1  # Decrease difficulty moderately
        else:
            adaptation_vector[0] = -0.2  # Decrease difficulty significantly
        
        # Adjust bug type emphasis based on error types
        syntax_issues = sum(1 for issue in feedback.issues if issue.get("error_type") == "syntax")
        logical_issues = sum(1 for issue in feedback.issues if issue.get("type") == "test_failure")
        
        if syntax_issues > logical_issues:
            adaptation_vector[1] = -0.1  # Move toward more logical bugs
        elif logical_issues > syntax_issues:
            adaptation_vector[1] = 0.1  # Move toward more syntax bugs
        
        # Adjust performance focus based on execution time and metrics
        if result.metrics and "execution_time" in result.metrics:
            if result.metrics["execution_time"] > self.config.get("performance_threshold", 1.0):
                adaptation_vector[2] = 0.2  # Increase performance focus
            else:
                adaptation_vector[2] = -0.1  # Decrease performance focus
        
        # Adjust edge case focus based on test failures
        if result.test_results:
            edge_case_failures = sum(1 for test_name, test_result in result.test_results.items()
                                    if not test_result["passed"] and "edge" in test_name.lower())
            if edge_case_failures > 0:
                adaptation_vector[3] = 0.2  # Increase edge case focus
            else:
                adaptation_vector[3] = 0.0  # Maintain current edge case focus
        
        # Adjust requirement expansion based on current state
        current_requirements = len(self.state.requirements)
        if current_requirements < 3:
            adaptation_vector[4] = 0.1  # Increase likelihood of adding requirements
        elif current_requirements >= 5:
            adaptation_vector[4] = -0.1  # Decrease likelihood of adding requirements
        
        return adaptation_vector


class DefaultTestRunner:
    """Default test runner for evaluating bug fixes."""
    
    def run_tests(self, solution_file: Path, test_files: List[Path], code_context: Dict[str, Any]) -> Dict[str, Any]:
        """
        Run tests against a solution file.
        
        Args:
            solution_file: Path to the solution file
            test_files: List of test file paths
            code_context: Context information about the code
            
        Returns:
            Dictionary of test results
        """
        # Initialize results
        results = {
            "all_passed": True,
            "passed_tests": 0,
            "total_tests": 0,
            "tests": {},
            "execution": {
                "success": True,
                "error": None,
                "stdout": None,
                "stderr": None
            },
            "execution_time": 0.0
        }
        
        # Import the solution to check for syntax errors
        try:
            # Check if the solution file exists
            if not solution_file.exists():
                results["execution"]["success"] = False
                results["execution"]["error"] = "Solution file not found"
                results["all_passed"] = False
                return results
            
            # Try to import the module to test for syntax errors
            sys.path.insert(0, str(solution_file.parent))
            import importlib.util
            spec = importlib.util.spec_from_file_location("solution", solution_file)
            solution_module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(solution_module)
            
            # Check for required functions
            if "required_functions" in code_context:
                for func_name in code_context["required_functions"]:
                    if not hasattr(solution_module, func_name):
                        results["execution"]["success"] = False
                        results["execution"]["error"] = f"Required function '{func_name}' not found"
                        results["all_passed"] = False
                        return results
            
        except Exception as e:
            results["execution"]["success"] = False
            results["execution"]["error"] = str(e)
            results["all_passed"] = False
            return results
        
        # Run each test file
        for test_file in test_files:
            # Skip if the test file doesn't exist
            if not test_file.exists():
                continue
            
            # Run the test file
            import unittest
            import io
            from contextlib import redirect_stdout, redirect_stderr
            
            # Create a test loader and find tests in the file
            loader = unittest.TestLoader()
            try:
                tests = loader.discover(str(test_file.parent), pattern=test_file.name)
                
                # Count the number of test cases
                test_cases = 0
                for suite in tests:
                    for test_case in suite:
                        test_cases += test_case.countTestCases()
                
                results["total_tests"] += test_cases
                
                # Run the tests
                runner = unittest.TextTestRunner(verbosity=2)
                
                # Capture stdout and stderr
                stdout_buffer = io.StringIO()
                stderr_buffer = io.StringIO()
                
                with redirect_stdout(stdout_buffer), redirect_stderr(stderr_buffer):
                    test_result = runner.run(tests)
                
                stdout = stdout_buffer.getvalue()
                stderr = stderr_buffer.getvalue()
                
                # Check if all tests passed
                if not test_result.wasSuccessful():
                    results["all_passed"] = False
                
                # Count passed tests
                passed_tests = test_cases - len(test_result.failures) - len(test_result.errors)
                results["passed_tests"] += passed_tests
                
                # Store individual test results
                test_name = test_file.stem
                results["tests"][test_name] = {
                    "passed": test_result.wasSuccessful(),
                    "failures": len(test_result.failures),
                    "errors": len(test_result.errors),
                    "skipped": len(test_result.skipped),
                    "total": test_cases,
                    "passed_count": passed_tests,
                    "stdout": stdout,
                    "stderr": stderr
                }
                
                # Extract more detailed information about failures
                for failure in test_result.failures:
                    test_id = failure[0].id()
                    failure_message = failure[1]
                    
                    # Extract expected and actual values if available
                    import re
                    expected_match = re.search(r'Expected\s*:(.+)', failure_message)
                    actual_match = re.search(r'Actual\s*:(.+)', failure_message)
                    
                    expected = expected_match.group(1).strip() if expected_match else None
                    actual = actual_match.group(1).strip() if actual_match else None
                    
                    if test_id not in results["tests"]:
                        results["tests"][test_id] = {}
                    
                    results["tests"][test_id].update({
                        "passed": False,
                        "message": failure_message,
                        "expected": expected,
                        "actual": actual
                    })
                
            except Exception as e:
                # If the test file itself has errors
                results["all_passed"] = False
                results["tests"][test_file.stem] = {
                    "passed": False,
                    "error": str(e),
                    "failures": 1,
                    "errors": 1,
                    "skipped": 0,
                    "total": 1,
                    "passed_count": 0
                }
                results["total_tests"] += 1
        
        return results


class BugFixingTaskGenerator:
    """Generator for bug fixing tasks."""
    
    def __init__(self, config: Dict[str, Any] = None):
        """
        Initialize the bug fixing task generator.
        
        Args:
            config: Configuration options
        """
        self.config = config or {}
        self.difficulty_levels = self.config.get(
            "difficulty_levels", 
            ["easy", "medium", "hard", "expert"]
        )
        self.bug_categories = self.config.get(
            "bug_categories",
            [
                BugCategory.SYNTAX,
                BugCategory.LOGICAL,
                BugCategory.EDGE_CASE,
                BugCategory.PERFORMANCE
            ]
        )
        self.test_templates = self._load_test_templates()
    
    def generate_task(self, difficulty: str = None, bug_categories: List[str] = None) -> BugFixingTask:
        """
        Generate a new bug fixing task.
        
        Args:
            difficulty: The difficulty level (easy, medium, hard, expert)
            bug_categories: List of bug categories to include
            
        Returns:
            A new bug fixing task
        """
        # Choose difficulty if not specified
        if difficulty is None:
            difficulty = random.choice(self.difficulty_levels)
        
        # Choose bug categories if not specified
        if bug_categories is None:
            num_categories = random.randint(1, 3)
            bug_categories = random.sample(self.bug_categories, num_categories)
        
        # Generate a problem based on difficulty and bug categories
        problem_state = self._generate_problem_state(difficulty, bug_categories)
        
        # Create config for the task
        task_config = {
            "difficulty": difficulty,
            "bug_categories": bug_categories,
            "convergence_criteria": {
                "score_threshold": 0.95,
                "min_iterations": 1,
                "max_iterations": self.config.get("max_iterations", 5),
                "score_delta_threshold": 0.05,
                "consecutive_plateau_limit": 2
            },
            "score_weights": {
                "test": 0.7,
                "execution": 0.3
            },
            "performance_threshold": 1.0,
            "complexity_threshold": 0.7
        }
        
        # Create and return the task
        return BugFixingTask(problem_state, task_config)
    
    def _generate_problem_state(self, difficulty: str, bug_categories: List[str]) -> ProblemState:
        """
        Generate a problem state for the given difficulty and bug categories.
        
        Args:
            difficulty: The difficulty level
            bug_categories: List of bug categories
            
        Returns:
            A problem state for the task
        """
        # Choose a template based on difficulty and bug categories
        template = self._choose_template(difficulty, bug_categories)
        
        # Create a copy of the template
        problem_state = copy.deepcopy(template)
        
        # Generate a unique ID
        problem_state.problem_id = str(uuid.uuid4())
        
        # Initialize evolution stage and adaptation vector
        problem_state.evolution_stage = 0
        problem_state.adaptation_vector = [0.0] * 5
        
        # Adjust difficulty value based on level
        difficulty_values = {
            "easy": 0.25,
            "medium": 0.5,
            "hard": 0.75,
            "expert": 0.9
        }
        problem_state.difficulty = difficulty_values.get(difficulty, 0.5)
        
        # Insert bugs based on categories
        for category in bug_categories:
            self._insert_bug(problem_state, category)
        
        # Update description to reflect the current state
        problem_state.description = self._generate_description(problem_state)
        
        return problem_state
    
    def _choose_template(self, difficulty: str, bug_categories: List[str]) -> ProblemState:
        """
        Choose a template that matches the difficulty and bug categories.
        
        Args:
            difficulty: The difficulty level
            bug_categories: List of bug categories
            
        Returns:
            A template problem state
        """
        # In a real implementation, this would load from a database of templates
        # For now, we'll generate a simple template
        
        # Generate code context with a sample function
        code = self._generate_template_code(difficulty, bug_categories)
        tests = self._generate_template_tests(code)
        
        # Create a basic problem state
        return ProblemState(
            problem_id="template",
            description="Fix the bugs in the given code.",
            code_context={
                "code": code,
                "tests": tests,
                "bug_count": 0,
                "bug_categories": []
            },
            requirements=[
                {
                    "type": "functional",
                    "description": "The code should pass all the provided tests.",
                    "difficulty": 0.3
                }
            ],
            difficulty=0.5,  # Will be overridden
            evolution_stage=0,
            adaptation_vector=[0.0] * 5
        )
    
    def _generate_template_code(self, difficulty: str, bug_categories: List[str]) -> str:
        """
        Generate template code based on difficulty and bug categories.
        
        Args:
            difficulty: The difficulty level
            bug_categories: List of bug categories
            
        Returns:
            Template code
        """
        # For demonstration, we'll use a few predefined templates
        templates = {
            "easy": """
def calculate_sum(numbers):
    \"\"\"Calculate the sum of a list of numbers.\"\"\"
    total = 0
    for num in numbers:
        total += num
    return total

def calculate_average(numbers):
    \"\"\"Calculate the average of a list of numbers.\"\"\"
    if not numbers:
        return 0
    return calculate_sum(numbers) / len(numbers)
""",
            "medium": """
def find_most_frequent(items):
    \"\"\"Find the most frequently occurring item in
# recursive_swe_bench/task_generators/bug_fixing.py (template generation)

def find_most_frequent(items):
    """Find the most frequently occurring item in a list."""
    if not items:
        return None
    
    counts = {}
    for item in items:
        if item in counts:
            counts[item] += 1
        else:
            counts[item] = 1
    
    max_count = 0
    max_item = None
    for item, count in counts.items():
        if count > max_count:
            max_count = count
            max_item = item
    
    return max_item

def binary_search(sorted_list, target):
    """Perform binary search on a sorted list."""
    left = 0
    right = len(sorted_list) - 1
    
    while left <= right:
        mid = (left + right) // 2
        if sorted_list[mid] == target:
            return mid
        elif sorted_list[mid] < target:
            left = mid + 1
        else:
            right = mid - 1
    
    return -1  # Target not found
""",
            "hard": """
def merge_sort(arr):
    """Sort an array using the merge sort algorithm."""
    if len(arr) <= 1:
        return arr
    
    # Split the array into two halves
    mid = len(arr) // 2
    left_half = arr[:mid]
    right_half = arr[mid:]
    
    # Recursively sort both halves
    left_half = merge_sort(left_half)
    right_half = merge_sort(right_half)
    
    # Merge the sorted halves
    return merge(left_half, right_half)

def merge(left, right):
    """Merge two sorted arrays."""
    result = []
    i = j = 0
    
    # Compare elements from both arrays and add the smaller one to the result
    while i < len(left) and j < len(right):
        if left[i] <= right[j]:
            result.append(left[i])
            i += 1
        else:
            result.append(right[j])
            j += 1
    
    # Add any remaining elements
    result.extend(left[i:])
    result.extend(right[j:])
    
    return result

def quicksort(arr):
    """Sort an array using the quicksort algorithm."""
    if len(arr) <= 1:
        return arr
    
    # Choose the pivot (using the first element for simplicity)
    pivot = arr[0]
    
    # Partition the array
    less = [x for x in arr[1:] if x <= pivot]
    greater = [x for x in arr[1:] if x > pivot]
    
    # Recursively sort the partitions and combine
    return quicksort(less) + [pivot] + quicksort(greater)
""",
            "expert": """
class Node:
    """Node in a binary tree."""
    def __init__(self, value):
        self.value = value
        self.left = None
        self.right = None

def build_binary_tree(values):
    """Build a binary tree from a list of values."""
    if not values:
        return None
    
    root = Node(values[0])
    queue = [root]
    i = 1
    
    while queue and i < len(values):
        node = queue.pop(0)
        
        # Add left child
        if i < len(values) and values[i] is not None:
            node.left = Node(values[i])
            queue.append(node.left)
        i += 1
        
        # Add right child
        if i < len(values) and values[i] is not None:
            node.right = Node(values[i])
            queue.append(node.right)
        i += 1
    
    return root

def is_balanced(root):
    """Check if a binary tree is balanced."""
    def height(node):
        if not node:
            return 0
        return max(height(node.left), height(node.right)) + 1
    
    def is_balanced_helper(node):
        if not node:
            return True
        
        left_height = height(node.left)
        right_height = height(node.right)
        
        if abs(left_height - right_height) > 1:
            return False
        
        return is_balanced_helper(node.left) and is_balanced_helper(node.right)
    
    return is_balanced_helper(root)

def find_lca(root, p, q):
    """Find the lowest common ancestor of two nodes in a binary tree."""
    if not root:
        return None
    
    if root.value == p or root.value == q:
        return root
    
    left_lca = find_lca(root.left, p, q)
    right_lca = find_lca(root.right, p, q)
    
    if left_lca and right_lca:
        return root
    
    return left_lca if left_lca else right_lca
"""
        }
        
        # Choose a template based on difficulty
        if difficulty in templates:
            return templates[difficulty]
        else:
            return templates["medium"]  # Default to medium if difficulty not found
    
    def _generate_template_tests(self, code: str) -> List[Dict[str, Any]]:
        """
        Generate template tests based on the code.
        
        Args:
            code: The template code
            
        Returns:
            List of test dictionaries
        """
        # Extract function names from the code
        function_names = re.findall(r'def\s+(\w+)', code)
        
        # Generate tests for each function
        tests = []
        for func_name in function_names:
            test_content = self._generate_test_for_function(func_name)
            if test_content:
                tests.append({
                    "name": f"test_{func_name}",
                    "content": test_content,
                    "description": f"Test for {func_name} function"
                })
        
        return tests
    
    def _generate_test_for_function(self, func_name: str) -> str:
        """
        Generate a test for a specific function.
        
        Args:
            func_name: The name of the function to test
            
        Returns:
            Test content
        """
        # Check if we have a template for this function
        if func_name in self.test_templates:
            return self.test_templates[func_name]
        
        # Generate a basic test based on the function name
        if "sum" in func_name.lower():
            return """
import unittest
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import calculate_sum

class TestCalculateSum(unittest.TestCase):
    def test_calculate_sum(self):
        self.assertEqual(calculate_sum([1, 2, 3, 4, 5]), 15)
        self.assertEqual(calculate_sum([]), 0)
        self.assertEqual(calculate_sum([-1, -2, -3]), -6)
        
if __name__ == '__main__':
    unittest.main()
"""
        elif "average" in func_name.lower():
            return """
import unittest
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import calculate_average

class TestCalculateAverage(unittest.TestCase):
    def test_calculate_average(self):
        self.assertEqual(calculate_average([1, 2, 3, 4, 5]), 3)
        self.assertEqual(calculate_average([]), 0)
        self.assertEqual(calculate_average([10]), 10)
        
if __name__ == '__main__':
    unittest.main()
"""
        elif "frequent" in func_name.lower():
            return """
import unittest
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import find_most_frequent

class TestFindMostFrequent(unittest.TestCase):
    def test_find_most_frequent(self):
        self.assertEqual(find_most_frequent([1, 2, 2, 3, 3, 3, 4]), 3)
        self.assertEqual(find_most_frequent(['a', 'b', 'a', 'c', 'a']), 'a')
        self.assertIsNone(find_most_frequent([]))
        self.assertEqual(find_most_frequent([5]), 5)
        
if __name__ == '__main__':
    unittest.main()
"""
        elif "search" in func_name.lower():
            return """
import unittest
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import binary_search

class TestBinarySearch(unittest.TestCase):
    def test_binary_search(self):
        self.assertEqual(binary_search([1, 2, 3, 4, 5], 3), 2)
        self.assertEqual(binary_search([1, 2, 3, 4, 5], 1), 0)
        self.assertEqual(binary_search([1, 2, 3, 4, 5], 5), 4)
        self.assertEqual(binary_search([1, 2, 3, 4, 5], 6), -1)
        self.assertEqual(binary_search([], 5), -1)
        
if __name__ == '__main__':
    unittest.main()
"""
        elif "sort" in func_name.lower():
            return """
import unittest
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import {0}

class Test{1}(unittest.TestCase):
    def test_sorting(self):
        self.assertEqual({0}([]), [])
        self.assertEqual({0}([1]), [1])
        self.assertEqual({0}([3, 1, 4, 1, 5, 9, 2, 6, 5]), [1, 1, 2, 3, 4, 5, 5, 6, 9])
        self.assertEqual({0}([9, 8, 7, 6, 5, 4, 3, 2, 1]), [1, 2, 3, 4, 5, 6, 7, 8, 9])
        self.assertEqual({0}([1, 1, 1, 1]), [1, 1, 1, 1])
        
if __name__ == '__main__':
    unittest.main()
""".format(func_name, func_name.title())
        elif "balanced" in func_name.lower():
            return """
import unittest
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import Node, is_balanced

class TestIsBalanced(unittest.TestCase):
    def test_is_balanced(self):
        # Create a balanced tree
        #      1
        #    /   \\
        #   2     3
        #  / \\   / \\
        # 4   5 6   7
        root = Node(1)
        root.left = Node(2)
        root.right = Node(3)
        root.left.left = Node(4)
        root.left.right = Node(5)
        root.right.left = Node(6)
        root.right.right = Node(7)
        self.assertTrue(is_balanced(root))
        
        # Create an unbalanced tree
        #      1
        #    /   \\
        #   2     3
        #  / \\
        # 4   5
        #/
        #6
        root = Node(1)
        root.left = Node(2)
        root.right = Node(3)
        root.left.left = Node(4)
        root.left.right = Node(5)
        root.left.left.left = Node(6)
        self.assertFalse(is_balanced(root))
        
        # Empty tree is balanced
        self.assertTrue(is_balanced(None))
        
if __name__ == '__main__':
    unittest.main()
"""
        elif "lca" in func_name.lower():
            return """
import unittest
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import Node, find_lca

class TestFindLCA(unittest.TestCase):
    def test_find_lca(self):
        # Create a tree
        #      1
        #    /   \\
        #   2     3
        #  / \\   / \\
        # 4   5 6   7
        root = Node(1)
        root.left = Node(2)
        root.right = Node(3)
        root.left.left = Node(4)
        root.left.right = Node(5)
        root.right.left = Node(6)
        root.right.right = Node(7)
        
        # Test cases
        self.assertEqual(find_lca(root, 4, 5).value, 2)  # LCA of 4 and 5 is 2
        self.assertEqual(find_lca(root, 4, 6).value, 1)  # LCA of 4 and 6 is 1
        self.assertEqual(find_lca(root, 3, 7).value, 3)  # LCA of 3 and 7 is 3
        self.assertEqual(find_lca(root, 2, 7).value, 1)  # LCA of 2 and 7 is 1
        
if __name__ == '__main__':
    unittest.main()
"""
        elif "tree" in func_name.lower():
            return """
import unittest
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import Node, build_binary_tree

class TestBuildBinaryTree(unittest.TestCase):
    def test_build_binary_tree(self):
        # Test empty list
        self.assertIsNone(build_binary_tree([]))
        
        # Test single node
        root = build_binary_tree([1])
        self.assertEqual(root.value, 1)
        self.assertIsNone(root.left)
        self.assertIsNone(root.right)
        
        # Test complete tree
        #      1
        #    /   \\
        #   2     3
        #  / \\   / \\
        # 4   5 6   7
        values = [1, 2, 3, 4, 5, 6, 7]
        root = build_binary_tree(values)
        self.assertEqual(root.value, 1)
        self.assertEqual(root.left.value, 2)
        self.assertEqual(root.right.value, 3)
        self.assertEqual(root.left.left.value, 4)
        self.assertEqual(root.left.right.value, 5)
        self.assertEqual(root.right.left.value, 6)
        self.assertEqual(root.right.right.value, 7)
        
        # Test tree with None values
        #      1
        #    /   \\
        #   2     3
        #  /     / 
        # 4     6   
        values = [1, 2, 3, 4, None, 6, None]
        root = build_binary_tree(values)
        self.assertEqual(root.value, 1)
        self.assertEqual(root.left.value, 2)
        self.assertEqual(root.right.value, 3)
        self.assertEqual(root.left.left.value, 4)
        self.assertIsNone(root.left.right)
        self.assertEqual(root.right.left.value, 6)
        self.assertIsNone(root.right.right)
        
if __name__ == '__main__':
    unittest.main()
"""
        else:
            # Generic test template
            return """
import unittest
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import {0}

class Test{1}(unittest.TestCase):
    def test_{0}(self):
        # TODO: Add specific test cases for {0}
        # This is a placeholder test
        self.assertTrue(True)
        
if __name__ == '__main__':
    unittest.main()
""".format(func_name, func_name.title())
    
    def _load_test_templates(self) -> Dict[str, str]:
        """
        Load test templates for common functions.
        
        Returns:
            Dictionary of test templates
        """
        # In a real implementation, these would be loaded from files
        return {
            "calculate_sum": """
import unittest
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import calculate_sum

class TestCalculateSum(unittest.TestCase):
    def test_calculate_sum(self):
        self.assertEqual(calculate_sum([1, 2, 3, 4, 5]), 15)
        self.assertEqual(calculate_sum([]), 0)
        self.assertEqual(calculate_sum([-1, -2, -3]), -6)
        
if __name__ == '__main__':
    unittest.main()
""",
            "calculate_average": """
import unittest
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import calculate_average

class TestCalculateAverage(unittest.TestCase):
    def test_calculate_average(self):
        self.assertEqual(calculate_average([1, 2, 3, 4, 5]), 3)
        self.assertEqual(calculate_average([]), 0)
        self.assertEqual(calculate_average([10]), 10)
        
if __name__ == '__main__':
    unittest.main()
"""
        }
    
    def _insert_bug(self, problem_state: ProblemState, bug_category: str) -> None:
        """
        Insert a bug of the specified category into the problem state.
        
        Args:
            problem_state: The problem state to modify
            bug_category: The category of bug to insert
        """
        if "code" not in problem_state.code_context:
            return
        
        # Parse the code to find potential bug insertion points
        code = problem_state.code_context["code"]
        try:
            parsed_code = ast.parse(code)
        except SyntaxError:
            # If the code already has syntax errors, don't add more bugs
            return
        
        # Insert different types of bugs based on the category
        if bug_category == BugCategory.SYNTAX:
            self._insert_syntax_bug(problem_state)
        elif bug_category == BugCategory.LOGICAL:
            self._insert_logical_bug(problem_state)
        elif bug_category == BugCategory.PERFORMANCE:
            self._insert_performance_bug(problem_state)
        elif bug_category == BugCategory.EDGE_CASE:
            self._insert_edge_case_bug(problem_state)
        else:
            # Default to logical bug
            self._insert_logical_bug(problem_state)
        
        # Update bug count and categories
        if "bug_count" not in problem_state.code_context:
            problem_state.code_context["bug_count"] = 0
        problem_state.code_context["bug_count"] += 1
        
        if "bug_categories" not in problem_state.code_context:
            problem_state.code_context["bug_categories"] = []
        if bug_category not in problem_state.code_context["bug_categories"]:
            problem_state.code_context["bug_categories"].append(bug_category)
    
    def _insert_syntax_bug(self, problem_state: ProblemState) -> None:
        """
        Insert a syntax bug into the problem state.
        
        Args:
            problem_state: The problem state to modify
        """
        code = problem_state.code_context["code"]
        lines = code.split('\n')
        if not lines:
            return
        
        # Choose a non-empty line to modify
        idx = random.randint(0, len(lines) - 1)
        line = lines[idx]
        
        # Skip empty lines or comment lines
        attempts = 0
        while (not line.strip() or line.strip().startswith('#')) and attempts < 10:
            idx = random.randint(0, len(lines) - 1)
            line = lines[idx]
            attempts += 1
        
        if attempts >= 10:
            # Couldn't find a suitable line, use the first non-empty line
            for i, line in enumerate(lines):
                if line.strip() and not line.strip().startswith('#'):
                    idx = i
                    break
            else:
                return  # No suitable line found
        
        # Choose a modification type
        mod_type = random.choice([
            "remove_character",
            "add_character",
            "swap_characters",
            "change_indent"
        ])
        
        if mod_type == "remove_character" and line:
            char_idx = random.randint(0, len(line) - 1)
            lines[idx] = line[:char_idx] + line[char_idx+1:]
        
        elif mod_type == "add_character":
            char_idx = random.randint(0, len(line))
            char = random.choice(["(", ")", "{", "}", "[", "]", ":", ";", ",", "."])
            lines[idx] = line[:char_idx] + char + line[char_idx:]
        
        elif mod_type == "swap_characters" and len(line) >= 2:
            char_idx = random.randint(0, len(line) - 2)
            lines[idx] = (line[:char_idx] + line[char_idx+1] + 
                         line[char_idx] + line[char_idx+2:])
        
        elif mod_type == "change_indent":
            # Either add or remove indentation
            if line.startswith("    "):
                lines[idx] = line[2:]  # Remove some indent
            else:
                lines[idx] = "  " + line  # Add inconsistent indent
        
        # Update the code
        problem_state.code_context["code"] = '\n'.join(lines)
        
        # Add information about the bug
        if "bugs" not in problem_state.code_context:
            problem_state.code_context["bugs"] = []
        
        problem_state.code_context["bugs"].append({
            "type": BugCategory.SYNTAX,
            "line": idx + 1,
            "description": f"Syntax error introduced in line {idx + 1}"
        })
    
    def _insert_logical_bug(self, problem_state: ProblemState) -> None:
        """
        Insert a logical bug into the problem state.
        
        Args:
            problem_state: The problem state to modify
        """
        code = problem_state.code_context["code"]
        lines = code.split('\n')
        if not lines:
            return
        
        # Find all if statements and loops
        if_statements = []
        for i, line in enumerate(lines):
            if re.search(r'\bif\b|\bwhile\b|\bfor\b', line):
                if_statements.append((i, line))
        
        # Choose a modification type
        mod_type = random.choice([
            "change_comparison",
            "invert_condition",
            "off_by_one",
            "change_operator",
            "reverse_logic"
        ])
        
        if if_statements:
            # Choose an if statement to modify
            idx, line = random.choice(if_statements)
            
            if mod_type == "change_comparison":
                # Change comparison operators
                comparisons = {"==": "!=", "!=": "==", ">": "<", "<": ">", ">=": "<=", "<=": ">="}
                for op, new_op in comparisons.items():
                    if op in line:
                        lines[idx] = line.replace(op, new_op, 1)
                        break
            
            elif mod_type == "invert_condition":
                # Add or remove a "not" to invert the condition
                if "not" in line:
                    lines[idx] = line.replace("not ", "", 1)
                else:
                    match = re.search(r'(if|while)\s+([^:]+):', line)
                    if match:
                        condition = match.group(2)
                        lines[idx] = line.replace(condition, f"not ({condition})", 1)
            
            elif mod_type == "off_by_one":
                # Introduce an off-by-one error
                for op in ["+", "-"]:
                    if op in line:
                        # If there's a number after the operator, change it
                        match = re.search(f'\\{op}\\s*(\\d+)', line)
                        if match:
                            num = int(match.group(1))
                            new_num = num + 1 if op == "+" else max(0, num - 1)
                            lines[idx] = line.replace(f"{op} {num}", f"{op} {new_num}", 1)
                            break
            
            elif mod_type == "change_operator":
                # Change arithmetic or logical operators
                operators = {"+": "-", "-": "+", "*": "/", "/": "*", "and": "or", "or": "and"}
                for op, new_op in operators.items():
                    if f" {op} " in line:
                        lines[idx] = line.replace(f" {op} ", f" {new_op} ", 1)
                        break
            
            elif mod_type == "reverse_logic":
                # Reverse the logic of a compound condition
                if " and " in line:
                    parts = line.split(" and ")
                    lines[idx] = line.replace(" and ".join(parts), " or ".join(parts), 1)
                elif " or " in line:
                    parts = line.split(" or ")
                    lines[idx] = line.replace(" or ".join(parts), " and ".join(parts), 1)
        
        else:
            # If no if statements found, introduce a different kind of logical error
            # Find variable assignments
            assignments = []
            for i, line in enumerate(lines):
                if "=" in line and "==" not in line and "!=" not in line:
                    assignments.append((i, line))
            
            if assignments:
                # Choose an assignment to modify
                idx, line = random.choice(assignments)
                
                # Modify the assignment
                if "+" in line:
                    lines[idx] = line.replace("+", "-", 1)
                elif "-" in line:
                    lines[idx] = line.replace("-", "+", 1)
                elif "*" in line:
                    lines[idx] = line.replace("*", "/", 1)
                elif "/" in line:
                    lines[idx] = line.replace("/", "*", 1)
                else:
                    # If no arithmetic operator, change the value
                    match = re.search(r'=\s*(\d+)', line)
                    if match:
                        num = int(match.group(1))
                        new_num = num + random.choice([-1, 1]) * random.randint(1, 3)
                        lines[idx] = line.replace(f"= {num}", f"= {new_num}", 1)
        
        # Update the code
        problem_state.code_context["code"] = '\n'.join(lines)
        
        # Add information about the bug
        if "bugs" not in problem_state.code_context:
            problem_state.code_context["bugs"] = []
        
        problem_state.code_context["bugs"].append({
            "type": BugCategory.LOGICAL,
            "line": idx + 1,
            "description": f"Logical error introduced in line {idx + 1}"
        })
    
    def _insert_performance_bug(self, problem_state: ProblemState) -> None:
        """
        Insert a performance bug into the problem state.
        
        Args:
            problem_state: The problem state to modify
        """
        code = problem_state.code_context["code"]
        lines = code.split('\n')
        if not lines:
            return
        
        # Find functions in the code
        functions = []
        current_func = None
        func_start = None
        for i, line in enumerate(lines):
            if line.strip().startswith("def "):
                if current_func:
                    functions.append((func_start, i - 1, current_func))
                current_func = line.strip()[4:].split("(")[0]
                func_start = i
            elif i == len(lines) - 1 and current_func:
                functions.append((func_start, i, current_func))
        
        if not functions:
            return
        
        # Choose a function to modify
        start_idx, end_idx, func_name = random.choice(functions)
        
        # Choose a modification type
        mod_type = random.choice([
            "add_nested_loop",
            "inefficient_data_structure",
            "redundant_computation"
        ])
        
        if mod_type == "add_nested_loop":
            # Find indentation of the function
            for i in range(start_idx + 1, end_idx + 1):
                if lines[i].strip():
                    indent = len(lines[i]) - len(lines[i].lstrip())
                    break
            else:
                indent = 4
            
            # Find a suitable place to add a nested loop
            for i in range(start_idx + 1, end_idx + 1):
                if "for " in lines[i] or "while " in lines[i]:
                    # Add a nested loop after this loop
                    inner_indent = len(lines[i]) - len(lines[i].lstrip()) + 4
                    inner_indent_str = ' ' * inner_indent
                    
                    # Add an unnecessary nested loop
                    lines.insert(i + 1, f"{inner_indent_str}for _ in range(100):  # Inefficient nested loop")
                    lines.insert(i + 2, f"{inner_indent_str}    pass")
                    
                    # Update indices
                    end_idx += 2
                    break
            else:
                # If no loop found, add one at the beginning of the function
                inner_indent = indent + 4
                inner_indent_str = ' ' * inner_indent
                
                # Find the first non-docstring line
                for i in range(start_idx + 1, end_idx + 1):
                    if lines[i].strip() and not (lines[i].strip().startswith('"""') or lines[i].strip().startswith("'''")):
                        # Add an unnecessary loop
                        lines.insert(i, f"{' ' * indent}for i in range(100):  # Inefficient loop")
                        lines.insert(i + 1, f"{inner_indent_str}pass")
                        
                        # Update indices
                        end_idx += 2
                        break
        
        elif mod_type == "ineff
# recursive_swe_bench/task_generators/bug_fixing.py (finalized)

        elif mod_type == "inefficient_data_structure":
            # Find indentation of the function
            for i in range(start_idx + 1, end_idx + 1):
                if lines[i].strip():
                    indent = len(lines[i]) - len(lines[i].lstrip())
                    break
            else:
                indent = 4
            
            # Find a suitable place to add inefficient data structure usage
            for i in range(start_idx + 1, end_idx + 1):
                if "def " not in lines[i] and lines[i].strip():
                    # Add inefficient data structure usage after this line
                    indent_str = ' ' * indent
                    
                    # Add inefficient code
                    lines.insert(i + 1, f"{indent_str}# Inefficient data structure usage")
                    lines.insert(i + 2, f"{indent_str}results = []")
                    lines.insert(i + 3, f"{indent_str}for i in range(1000):  # Unnecessarily large range")
                    lines.insert(i + 4, f"{indent_str}    # Using list instead of set for lookups")
                    lines.insert(i + 5, f"{indent_str}    if i % 10 in results:  # O(n) lookup instead of O(1)")
                    lines.insert(i + 6, f"{indent_str}        results.append(i)  # Unnecessary storage")
                    
                    # Update indices
                    end_idx += 6
                    break
        
        elif mod_type == "redundant_computation":
            # Find indentation of the function
            for i in range(start_idx + 1, end_idx + 1):
                if lines[i].strip():
                    indent = len(lines[i]) - len(lines[i].lstrip())
                    break
            else:
                indent = 4
            
            # Find a suitable place to add redundant computation
            for i in range(start_idx + 1, end_idx + 1):
                if "for " in lines[i] or "while " in lines[i]:
                    # Add redundant computation inside the loop
                    inner_indent = len(lines[i]) - len(lines[i].lstrip()) + 4
                    inner_indent_str = ' ' * inner_indent
                    
                    # Add redundant computation
                    lines.insert(i + 1, f"{inner_indent_str}# Redundant computation in each iteration")
                    lines.insert(i + 2, f"{inner_indent_str}temp_sum = 0")
                    lines.insert(i + 3, f"{inner_indent_str}for j in range(100):  # Unnecessary nested computation")
                    lines.insert(i + 4, f"{inner_indent_str}    temp_sum += j")
                    
                    # Update indices
                    end_idx += 4
                    break
        
        # Update the code
        problem_state.code_context["code"] = '\n'.join(lines)
        
        # Add information about the bug
        if "bugs" not in problem_state.code_context:
            problem_state.code_context["bugs"] = []
        
        problem_state.code_context["bugs"].append({
            "type": BugCategory.PERFORMANCE,
            "line": start_idx + 1,
            "description": f"Performance issue introduced in function '{func_name}'"
        })
    
    def _insert_edge_case_bug(self, problem_state: ProblemState) -> None:
        """
        Insert an edge case bug into the problem state.
        
        Args:
            problem_state: The problem state to modify
        """
        code = problem_state.code_context["code"]
        lines = code.split('\n')
        if not lines:
            return
        
        # Find functions in the code
        functions = []
        current_func = None
        func_start = None
        for i, line in enumerate(lines):
            if line.strip().startswith("def "):
                if current_func:
                    functions.append((func_start, i - 1, current_func))
                current_func = line.strip()[4:].split("(")[0]
                func_start = i
            elif i == len(lines) - 1 and current_func:
                functions.append((func_start, i, current_func))
        
        if not functions:
            return
        
        # Choose a function to modify
        start_idx, end_idx, func_name = random.choice(functions)
        
        # Choose a modification type
        mod_type = random.choice([
            "remove_boundary_check",
            "missing_edge_case",
            "type_assumption"
        ])
        
        if mod_type == "remove_boundary_check":
            # Find boundary checks (if statements with conditions that check boundaries)
            boundary_checks = []
            for i in range(start_idx + 1, end_idx + 1):
                if (re.search(r'if\s+.*(len|empty|<=|>=|<|>|==|!=)', lines[i]) and 
                    (("if not " in lines[i]) or ("if len(" in lines[i]) or 
                     ("if " in lines[i] and " == 0" in lines[i]) or
                     ("if " in lines[i] and " == []" in lines[i]) or
                     ("if " in lines[i] and " == ''" in lines[i]) or
                     ("if " in lines[i] and " is None" in lines[i]))):
                    boundary_checks.append(i)
            
            if boundary_checks:
                # Choose a boundary check to remove
                idx = random.choice(boundary_checks)
                
                # Comment out the boundary check
                lines[idx] = f"# {lines[idx]}  # Boundary check removed"
                
                # Comment out the body of the if statement
                i = idx + 1
                while i <= end_idx and (not lines[i].strip() or len(lines[i]) - len(lines[i].lstrip()) > len(lines[idx]) - len(lines[idx].lstrip())):
                    lines[i] = f"# {lines[i]}"
                    i += 1
            else:
                # If no boundary check found, add code that assumes a non-empty input
                # Find the first non-docstring line in the function
                for i in range(start_idx + 1, end_idx + 1):
                    if lines[i].strip() and not (lines[i].strip().startswith('"""') or lines[i].strip().startswith("'''")):
                        indent = len(lines[i]) - len(lines[i].lstrip())
                        indent_str = ' ' * indent
                        
                        # Add code that assumes non-empty input
                        lines.insert(i, f"{indent_str}# Missing check for empty input")
                        lines.insert(i + 1, f"{indent_str}first_item = items[0]  # Will fail on empty input")
                        
                        # Update indices
                        end_idx += 2
                        break
        
        elif mod_type == "missing_edge_case":
            # Find a suitable place to insert the bug
            for i in range(start_idx + 1, end_idx + 1):
                if ("/" in lines[i] or 
                    "if " in lines[i] and "==" in lines[i] or 
                    "if " in lines[i] and "!=" in lines[i]):
                    
                    if "/" in lines[i] and not re.search(r'if\s+.*!=\s*0', lines[i-1]):
                        # Add code that doesn't check for zero division
                        indent = len(lines[i]) - len(lines[i].lstrip())
                        indent_str = ' ' * indent
                        
                        # Extract the denominator
                        match = re.search(r'/\s*(\w+)', lines[i])
                        if match:
                            denominator = match.group(1)
                            
                            # Comment out any existing check
                            j = i - 1
                            while j >= start_idx and len(lines[j]) - len(lines[j].lstrip()) >= indent:
                                if f"if {denominator}" in lines[j] and "== 0" in lines[j]:
                                    lines[j] = f"# {lines[j]}  # Zero division check removed"
                                j -= 1
                            
                            # Add a comment about the missing check
                            lines.insert(i, f"{indent_str}# Missing check for zero division")
                            
                            # Update indices
                            end_idx += 1
                            break
                    
                    elif ("==" in lines[i] or "!=" in lines[i]) and "None" not in lines[i]:
                        # Comment out edge case check
                        lines[i] = f"# {lines[i]}  # Edge case check removed"
                        break
            else:
                # If no suitable place found, add code that doesn't handle an edge case
                # Find the first non-docstring line in the function
                for i in range(start_idx + 1, end_idx + 1):
                    if lines[i].strip() and not (lines[i].strip().startswith('"""') or lines[i].strip().startswith("'''")):
                        indent = len(lines[i]) - len(lines[i].lstrip())
                        indent_str = ' ' * indent
                        
                        # Add code that doesn't handle an edge case
                        lines.insert(i, f"{indent_str}# Missing handling for edge cases")
                        lines.insert(i + 1, f"{indent_str}# This function doesn't handle special cases properly")
                        
                        # Update indices
                        end_idx += 2
                        break
        
        elif mod_type == "type_assumption":
            # Find a suitable place to insert a type assumption bug
            for i in range(start_idx + 1, end_idx + 1):
                if re.search(r'for\s+\w+\s+in\s+\w+', lines[i]) or "=" in lines[i] and "[" in lines[i]:
                    # Extract the variable name
                    var_match = re.search(r'for\s+\w+\s+in\s+(\w+)', lines[i])
                    if not var_match:
                        var_match = re.search(r'(\w+)\s*=', lines[i])
                    
                    if var_match:
                        var_name = var_match.group(1)
                        indent = len(lines[i]) - len(lines[i].lstrip())
                        indent_str = ' ' * indent
                        
                        # Add code that assumes a specific type
                        lines.insert(i + 1, f"{indent_str}# Type assumption: {var_name} is assumed to be a list")
                        lines.insert(i + 2, f"{indent_str}if len({var_name}) > 0:  # Will fail if {var_name} doesn't support len()")
                        lines.insert(i + 3, f"{indent_str}    first = {var_name}[0]  # Will fail if {var_name} is not subscriptable")
                        
                        # Update indices
                        end_idx += 3
                        break
            else:
                # If no suitable place found, add code at the beginning of the function
                for i in range(start_idx + 1, end_idx + 1):
                    if lines[i].strip() and not (lines[i].strip().startswith('"""') or lines[i].strip().startswith("'''")):
                        indent = len(lines[i]) - len(lines[i].lstrip())
                        indent_str = ' ' * indent
                        
                        # Extract parameter name
                        param_match = re.search(r'def\s+\w+\s*\(\s*(\w+)', lines[start_idx])
                        param_name = param_match.group(1) if param_match else "input_data"
                        
                        # Add code that assumes a specific type
                        lines.insert(i, f"{indent_str}# Type assumption: {param_name} is assumed to be a specific type")
                        lines.insert(i + 1, f"{indent_str}{param_name}_str = str({param_name})  # Will fail if {param_name} can't be converted to string")
                        
                        # Update indices
                        end_idx += 2
                        break
        
        # Update the code
        problem_state.code_context["code"] = '\n'.join(lines)
        
        # Add information about the bug
        if "bugs" not in problem_state.code_context:
            problem_state.code_context["bugs"] = []
        
        problem_state.code_context["bugs"].append({
            "type": BugCategory.EDGE_CASE,
            "line": start_idx + 1,
            "description": f"Edge case bug introduced in function '{func_name}'"
        })
    
    def _generate_description(self, problem_state: ProblemState) -> str:
        """
        Generate a description for the current problem state.
        
        Args:
            problem_state: The problem state
            
        Returns:
            A descriptive prompt for the problem
        """
        # Base description
        bug_count = problem_state.code_context.get("bug_count", 0)
        plural = "bugs" if bug_count != 1 else "bug"
        
        base_desc = (
            f"Fix the {plural} in the code below. "
            f"There {'are' if bug_count != 1 else 'is'} {bug_count} {plural} to find and fix."
        )
        
        # Add information about bug categories
        if "bug_categories" in problem_state.code_context:
            categories = problem_state.code_context["bug_categories"]
            if categories:
                category_desc = ", ".join(categories)
                base_desc += f"\n\nThe code contains the following types of issues: {category_desc}."
        
        # Add requirements
        if problem_state.requirements:
            base_desc += "\n\nRequirements:"
            for i, req in enumerate(problem_state.requirements):
                base_desc += f"\n{i+1}. {req['description']}"
        
        # Add difficulty level
        difficulty_desc = "easy"
        if problem_state.difficulty > 0.3 and problem_state.difficulty <= 0.6:
            difficulty_desc = "moderate"
        elif problem_state.difficulty > 0.6 and problem_state.difficulty <= 0.8:
            difficulty_desc = "challenging"
        elif problem_state.difficulty > 0.8:
            difficulty_desc = "very challenging"
        
        base_desc += f"\n\nThis is a {difficulty_desc} bug fixing task."
        
        return base_desc


# Default implementation of TestRunner for when no custom runner is provided
class DefaultTestRunner:
    """
    Default test runner for evaluating solutions.
    
    This class runs tests against a solution file and collects the results.
    """
    
    def run_tests(
        self, 
        solution_file: Path, 
        test_files: List[Path],
        code_context: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Run tests against a solution file.
        
        Args:
            solution_file: Path to the solution file
            test_files: List of test file paths
            code_context: Additional context about the code
            
        Returns:
            Dictionary containing test results
        """
        # Initialize results dictionary
        results = {
            "all_passed": True,
            "passed_tests": 0,
            "total_tests": 0,
            "tests": {},
            "execution": {
                "success": True,
                "error": None,
                "stdout": "",
                "stderr": ""
            },
            "execution_time": 0.0
        }
        
        # Check if solution file exists
        if not solution_file.exists():
            results["execution"]["success"] = False
            results["execution"]["error"] = f"Solution file not found: {solution_file}"
            results["all_passed"] = False
            return results
        
        # Try to import the solution module
        try:
            start_time = time.time()
            
            # Add solution directory to path
            sys.path.insert(0, str(solution_file.parent))
            
            # Import the solution module
            spec = importlib.util.spec_from_file_location(
                "solution", solution_file)
            solution_module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(solution_module)
            
            # Remove the solution directory from path
            sys.path.pop(0)
            
            # Record execution time
            end_time = time.time()
            results["execution_time"] = end_time - start_time
            
        except Exception as e:
            results["execution"]["success"] = False
            results["execution"]["error"] = str(e)
            results["all_passed"] = False
            return results
        
        # Run each test file
        for test_file in test_files:
            # Skip if the test file doesn't exist
            if not test_file.exists():
                continue
            
            try:
                # Set up test loading
                loader = unittest.TestLoader()
                
                # Add test directory to path
                sys.path.insert(0, str(test_file.parent))
                
                # Capture stdout and stderr
                stdout_buffer = io.StringIO()
                stderr_buffer = io.StringIO()
                
                # Create a test suite from the test file
                test_suite = loader.discover(
                    str(test_file.parent),
                    pattern=test_file.name
                )
                
                # Count test cases
                test_count = 0
                for suite in test_suite:
                    for test_case in suite:
                        test_count += test_case.countTestCases()
                
                results["total_tests"] += test_count
                
                # Run the tests with captured output
                with redirect_stdout(stdout_buffer), redirect_stderr(stderr_buffer):
                    test_runner = unittest.TextTestRunner(verbosity=2)
                    test_result = test_runner.run(test_suite)
                
                # Get the captured output
                stdout = stdout_buffer.getvalue()
                stderr = stderr_buffer.getvalue()
                
                # Remove the test directory from path
                sys.path.pop(0)
                
                # Check if all tests passed
                if not test_result.wasSuccessful():
                    results["all_passed"] = False
                
                # Count passed tests
                passed_tests = test_count - len(test_result.failures) - len(test_result.errors)
                results["passed_tests"] += passed_tests
                
                # Store individual test results
                test_name = test_file.stem
                results["tests"][test_name] = {
                    "passed": test_result.wasSuccessful(),
                    "failures": len(test_result.failures),
                    "errors": len(test_result.errors),
                    "skipped": len(test_result.skipped),
                    "total": test_count,
                    "passed_count": passed_tests,
                    "stdout": stdout,
                    "stderr": stderr
                }
                
                # Store details for individual test failures
                for failure in test_result.failures + test_result.errors:
                    test_id = failure[0].id().split('.')[-1]
                    failure_message = failure[1]
                    
                    # Try to extract expected and actual values
                    expected_match = re.search(r'Expected\s*:(.+)', failure_message)
                    actual_match = re.search(r'Actual\s*:(.+)', failure_message)
                    
                    expected = expected_match.group(1).strip() if expected_match else None
                    actual = actual_match.group(1).strip() if actual_match else None
                    
                    if test_id not in results["tests"]:
                        results["tests"][test_id] = {}
                    
                    results["tests"][test_id].update({
                        "passed": False,
                        "message": failure_message,
                        "expected": expected,
                        "actual": actual
                    })
                
            except Exception as e:
                # If there's an error in the test file itself
                results["all_passed"] = False
                test_name = test_file.stem
                results["tests"][test_name] = {
                    "passed": False,
                    "error": str(e),
                    "failures": 0,
                    "errors": 1,
                    "skipped": 0,
                    "total": 1,
                    "passed_count": 0
                }
                results["total_tests"] += 1
        
        return results