# recursive_swe_bench/task_generators/bug_fixing.py
from typing import Any, Dict, List, Optional, Tuple, Set, Union
import uuid
import json
import re
import random
import ast
import copy
from pathlib import Path
import tempfile
import subprocess
import shutil
import os
from recursive_swe_bench.core.recursive_task import (
RecursiveTask, ProblemState, EvaluationResult, Feedback, TaskStatus
)
class BugCategory:
"""Categories of bugs for classification and evolution."""
SYNTAX = "syntax"
LOGICAL = "logical"
PERFORMANCE = "performance"
SECURITY = "security"
CONCURRENCY = "concurrency"
EXCEPTION_HANDLING = "exception_handling"
API_USAGE = "api_usage"
MEMORY_MANAGEMENT = "memory_management"
TYPE_ERROR = "type_error"
EDGE_CASE = "edge_case"
DATA_HANDLING = "data_handling"
DEPENDENCY = "dependency"
class BugFixingTask(RecursiveTask):
"""
A recursive task for evaluating how models fix bugs in code.
The task presents a piece of code with one or more bugs, and evolves
based on the model's fix attempts. As the model addresses issues,
the task may introduce more subtle bugs, change requirements, or
increase complexity to test adaptive problem-solving.
"""
def __init__(
self,
initial_state: ProblemState,
config: Dict[str, Any] = None,
test_runner: Any = None
):
"""
Initialize the bug fixing task.
Args:
initial_state: The initial problem state
config: Configuration options
test_runner: Custom test runner (optional)
"""
super().__init__(initial_state, config)
self.test_runner = test_runner or DefaultTestRunner()
self.bug_categories: Set[str] = set(
self.config.get("bug_categories", [BugCategory.LOGICAL, BugCategory.SYNTAX])
)
self.difficulty_progression = self.config.get(
"difficulty_progression", [0.0, 0.15, 0.3, 0.5, 0.7]
)
self.evolution_strategies = self.config.get(
"evolution_strategies", ["add_subtle_bug", "change_requirements", "increase_complexity"]
)
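# Example configuration (a sketch assembled from the keys this class reads via
# self.config.get; the values shown are the in-code defaults unless noted):
#
#     config = {
#         "bug_categories": [BugCategory.LOGICAL, BugCategory.SYNTAX],
#         "difficulty_progression": [0.0, 0.15, 0.3, 0.5, 0.7],
#         "evolution_strategies": ["add_subtle_bug", "change_requirements", "increase_complexity"],
#         "score_weights": {"test": 0.7, "execution": 0.3},  # used by _calculate_score
#         "performance_threshold": 1.0,  # seconds (assumed unit), used by _determine_focus_areas
#         "complexity_threshold": 0.7,   # used by _determine_focus_areas
#     }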
def _run_evaluation(self, solution: str) -> EvaluationResult:
"""
Run tests to evaluate the solution.
Args:
solution: The solution code
Returns:
Evaluation results
"""
# Create a temporary directory to run tests
with tempfile.TemporaryDirectory() as temp_dir:
temp_path = Path(temp_dir)
# Write solution code to file
solution_file = temp_path / "solution.py"
with open(solution_file, "w") as f:
f.write(solution)
# Create test files
test_files = self._create_test_files(temp_path)
# Run tests
results = self.test_runner.run_tests(
solution_file=solution_file,
test_files=test_files,
code_context=self.state.code_context
)
# Calculate score based on test results
score = self._calculate_score(results)
return EvaluationResult(
success=results["all_passed"],
score=score,
execution_results=results["execution"],
error_details=results.get("errors"),
test_results=results["tests"],
metrics={
"passed_tests": results["passed_tests"],
"total_tests": results["total_tests"],
"execution_time": results["execution_time"],
"memory_usage": results.get("memory_usage", 0),
"code_complexity": self._calculate_complexity(solution)
}
)
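# Expected shape of the `results` dict returned by the test runner (inferred from
# the keys accessed above and in _calculate_score; concrete values are illustrative):
#
#     results = {
#         "all_passed": False,
#         "passed_tests": 3,
#         "total_tests": 5,
#         "execution_time": 0.42,          # assumed to be in seconds
#         "memory_usage": 12.5,            # optional
#         "execution": {"success": True},
#         "tests": {"test_0": {"passed": False, "message": "...", "expected": 4, "actual": 5}},
#         "errors": {"runtime": {"message": "...", "location": "solution.py:10"}},  # optional
#     }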
def _generate_feedback(self, solution: str, result: EvaluationResult) -> Feedback:
"""
Generate structured feedback based on evaluation results.
Args:
solution: The solution code
result: The evaluation results
Returns:
Structured feedback
"""
issues = []
suggestions = []
focus_areas = []
# Add issues for failing tests
if result.test_results:
for test_name, test_result in result.test_results.items():
if not test_result["passed"]:
issues.append({
"type": "test_failure",
"test": test_name,
"message": test_result.get("message", "Test failed"),
"expected": test_result.get("expected"),
"actual": test_result.get("actual")
})
# Add issues for errors
if result.error_details:
for error_type, error_info in result.error_details.items():
issues.append({
"type": "error",
"error_type": error_type,
"message": error_info.get("message", "An error occurred"),
"location": error_info.get("location")
})
# Generate suggestions based on issues
for issue in issues:
if issue["type"] == "test_failure":
suggestion = self._generate_suggestion_for_test_failure(
issue, solution, result.test_results
)
if suggestion:
suggestions.append(suggestion)
elif issue["type"] == "error":
suggestion = self._generate_suggestion_for_error(
issue, solution
)
if suggestion:
suggestions.append(suggestion)
# Determine focus areas based on issues and task state
focus_areas = self._determine_focus_areas(issues, solution, result)
# Generate adaptation hints based on the current state and results
adaptation_hints = self._generate_adaptation_hints(solution, result)
# Create summary
if result.success:
summary = (
f"Your solution passes all tests with a score of {result.score:.2f}. "
f"The code successfully addresses the bugs in the original implementation."
)
else:
passed = result.metrics.get("passed_tests", 0)
total = result.metrics.get("total_tests", 0)
summary = (
f"Your solution passes {passed}/{total} tests with a score of {result.score:.2f}. "
f"There are still issues that need to be addressed."
)
return Feedback(
summary=summary,
issues=issues,
suggestions=suggestions,
focus_areas=focus_areas,
adaptation_hints=adaptation_hints
)
def _evolve_state(self, solution: str, result: EvaluationResult, feedback: Feedback) -> ProblemState:
"""
Evolve the problem state based on the solution and feedback.
This method implements the recursive nature of the benchmark by
adapting the problem to challenge the model's understanding.
Args:
solution: The attempted solution
result: The evaluation results
feedback: The feedback provided
Returns:
The evolved problem state
"""
# If the solution perfectly solved the problem, make it more challenging
if result.success and result.score > 0.95:
return self._increase_difficulty(solution, result, feedback)
# If the solution was close but not perfect, focus on the remaining issues
elif result.score > 0.7:
return self._focus_remaining_issues(solution, result, feedback)
# If the solution was not very good, provide more guidance
else:
return self._provide_more_guidance(solution, result, feedback)
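# Example routing through the thresholds above (numbers illustrative):
#     result.success=True,  result.score=0.97 -> _increase_difficulty
#     result.success=False, result.score=0.82 -> _focus_remaining_issues
#     result.success=False, result.score=0.40 -> _provide_more_guidance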
def _increase_difficulty(self, solution: str, result: EvaluationResult, feedback: Feedback) -> ProblemState:
"""
Increase the difficulty of the problem for models that solved it well.
Args:
solution: The successful solution
result: The evaluation results
feedback: The feedback provided
Returns:
The evolved problem state with increased difficulty
"""
# Create a new state based on the current state
new_state = copy.deepcopy(self.state)
# Increment evolution stage
new_state.evolution_stage += 1
# Increase difficulty based on progression schedule
current_difficulty_idx = min(new_state.evolution_stage,
len(self.difficulty_progression) - 1)
new_state.difficulty = self.difficulty_progression[current_difficulty_idx]
# Select an evolution strategy based on the current state
strategy = self._select_evolution_strategy(solution, result, feedback)
# Apply the selected strategy
if strategy == "add_subtle_bug":
self._add_subtle_bug(new_state, solution)
elif strategy == "change_requirements":
self._change_requirements(new_state, solution)
elif strategy == "increase_complexity":
self._increase_complexity(new_state, solution)
# Update the description to reflect the changes
new_state.description = self._generate_description(new_state)
# Update adaptation vector to guide future evolution
new_state.adaptation_vector = self._calculate_adaptation_vector(
solution, result, feedback
)
return new_state
def _focus_remaining_issues(self, solution: str, result: EvaluationResult, feedback: Feedback) -> ProblemState:
"""
Evolve the state to focus on remaining issues when the solution is close but not perfect.
Args:
solution: The nearly-successful solution
result: The evaluation results
feedback: The feedback provided
Returns:
The evolved problem state focusing on remaining issues
"""
# Create a new state based on the current state
new_state = copy.deepcopy(self.state)
# Increment evolution stage
new_state.evolution_stage += 1
# Maintain the same difficulty level
current_difficulty_idx = min(new_state.evolution_stage - 1,
len(self.difficulty_progression) - 1)
new_state.difficulty = self.difficulty_progression[current_difficulty_idx]
# Update the code context to focus on remaining issues
new_state.code_context["focus_areas"] = feedback.focus_areas
# Highlight failing tests in the code context
if result.test_results:
failing_tests = [
test_name for test_name, test_result in result.test_results.items()
if not test_result["passed"]
]
new_state.code_context["failing_tests"] = failing_tests
# Update the description to be more specific about remaining issues
new_state.description = self._generate_focused_description(
new_state, feedback.issues
)
# Update adaptation vector to guide future evolution
new_state.adaptation_vector = self._calculate_adaptation_vector(
solution, result, feedback
)
return new_state
def _provide_more_guidance(self, solution: str, result: EvaluationResult, feedback: Feedback) -> ProblemState:
"""
Evolve the state to provide more guidance when the solution was not very good.
Args:
solution: The unsuccessful solution
result: The evaluation results
feedback: The feedback provided
Returns:
The evolved problem state with more guidance
"""
# Create a new state based on the current state
new_state = copy.deepcopy(self.state)
# Increment evolution stage
new_state.evolution_stage += 1
# Maintain or slightly decrease difficulty
current_difficulty_idx = max(0, min(new_state.evolution_stage - 1,
len(self.difficulty_progression) - 1) - 1)
new_state.difficulty = self.difficulty_progression[current_difficulty_idx]
# Add more hints to the code context
new_state.code_context["hints"] = self._generate_hints(
solution, result, feedback
)
# Add more detailed information about failing tests
if result.test_results:
detailed_test_results = {}
for test_name, test_result in result.test_results.items():
if not test_result["passed"]:
detailed_test_results[test_name] = {
"message": test_result.get("message", "Test failed"),
"expected": test_result.get("expected"),
"actual": test_result.get("actual"),
"hint": self._generate_test_hint(test_name, test_result)
}
new_state.code_context["detailed_test_results"] = detailed_test_results
# Update the description to include more guidance
new_state.description = self._generate_guided_description(
new_state, feedback.issues, feedback.suggestions
)
# Update adaptation vector to guide future evolution
new_state.adaptation_vector = self._calculate_adaptation_vector(
solution, result, feedback
)
return new_state
def _select_evolution_strategy(self, solution: str, result: EvaluationResult, feedback: Feedback) -> str:
"""
Select an evolution strategy based on the current state and solution.
Args:
solution: The current solution
result: The evaluation results
feedback: The feedback provided
Returns:
The selected evolution strategy
"""
available_strategies = self.evolution_strategies.copy()
# Weight the strategies based on the current state
weights = {}
# Prefer adding subtle bugs if the solution is very good
if result.score > 0.95:
weights["add_subtle_bug"] = 0.6
weights["change_requirements"] = 0.3
weights["increase_complexity"] = 0.1
# Prefer changing requirements if we've already added several bugs
elif self.state.evolution_stage >= 2 and "bug_count" in self.state.code_context and self.state.code_context["bug_count"] >= 3:
weights["add_subtle_bug"] = 0.1
weights["change_requirements"] = 0.7
weights["increase_complexity"] = 0.2
# Prefer increasing complexity if the solution is good but not perfect
elif result.score > 0.85:
weights["add_subtle_bug"] = 0.2
weights["change_requirements"] = 0.2
weights["increase_complexity"] = 0.6
# Default to equal weights
else:
weights = {strategy: 1.0 / len(available_strategies)
for strategy in available_strategies}
# Normalize weights for available strategies
total_weight = sum(weights.get(strategy, 0) for strategy in available_strategies)
normalized_weights = [weights.get(strategy, 0) / total_weight
for strategy in available_strategies]
# Select a strategy based on weights
return random.choices(available_strategies, weights=normalized_weights)[0]
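# Worked example (illustrative): with result.score = 0.97 the weights above are
# {"add_subtle_bug": 0.6, "change_requirements": 0.3, "increase_complexity": 0.1};
# if all three strategies are configured, total_weight = 1.0 and random.choices
# receives normalized weights [0.6, 0.3, 0.1].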
def _add_subtle_bug(self, state: ProblemState, solution: str) -> None:
"""
Add a subtle bug to the solution code.
Args:
state: The problem state to modify
solution: The current solution
"""
# Parse the solution to find potential bug insertion points
try:
parsed_solution = ast.parse(solution)
except SyntaxError:
# If we can't parse the solution, just add a syntax error
self._add_syntax_error(state, solution)
return
# Choose a bug category based on available categories
available_categories = list(self.bug_categories)
if available_categories:
bug_category = random.choice(available_categories)
else:
bug_category = BugCategory.LOGICAL
# Add a bug based on the selected category
if bug_category == BugCategory.SYNTAX:
self._add_syntax_error(state, solution)
elif bug_category == BugCategory.LOGICAL:
self._add_logical_error(state, solution, parsed_solution)
elif bug_category == BugCategory.PERFORMANCE:
self._add_performance_issue(state, solution, parsed_solution)
elif bug_category == BugCategory.EDGE_CASE:
self._add_edge_case_issue(state, solution, parsed_solution)
else:
# Default to logical error
self._add_logical_error(state, solution, parsed_solution)
# Update bug count in code context
if "bug_count" not in state.code_context:
state.code_context["bug_count"] = 0
state.code_context["bug_count"] += 1
# Add the bug category to the context
if "bug_categories" not in state.code_context:
state.code_context["bug_categories"] = []
state.code_context["bug_categories"].append(bug_category)
def _change_requirements(self, state: ProblemState, solution: str) -> None:
"""
Change the requirements to challenge the current solution.
Args:
state: The problem state to modify
solution: The current solution
"""
# Get the current requirements
requirements = state.requirements
# Add a new requirement
new_requirement = self._generate_new_requirement(state, solution)
if new_requirement:
requirements.append(new_requirement)
# Modify an existing requirement if possible
if requirements and random.random() < 0.5:
idx = random.randint(0, len(requirements) - 1)
requirements[idx] = self._modify_requirement(requirements[idx], state, solution)
def _increase_complexity(self, state: ProblemState, solution: str) -> None:
"""
Increase the complexity of the task.
Args:
state: The problem state to modify
solution: The current solution
"""
# Parse the solution if possible
try:
parsed_solution = ast.parse(solution)
except SyntaxError:
# If we can't parse the solution, make a simpler change
self._add_edge_case_requirement(state)
return
# Choose a complexity increase strategy
strategies = [
"add_edge_cases",
"increase_data_volume",
"add_performance_constraint",
"expand_functionality"
]
strategy = random.choice(strategies)
if strategy == "add_edge_cases":
self._add_edge_case_requirement(state)
elif strategy == "increase_data_volume":
self._increase_data_volume(state, solution)
elif strategy == "add_performance_constraint":
self._add_performance_constraint(state, solution)
elif strategy == "expand_functionality":
self._expand_functionality(state, solution)
def _create_test_files(self, temp_path: Path) -> List[Path]:
"""
Create test files based on the current problem state.
Args:
temp_path: The temporary directory path
Returns:
List of test file paths
"""
test_files = []
# Create test files from the code context
if "tests" in self.state.code_context:
for i, test in enumerate(self.state.code_context["tests"]):
test_file = temp_path / f"test_{i}.py"
with open(test_file, "w") as f:
f.write(test["content"])
test_files.append(test_file)
# Create a default test file if no tests are specified
if not test_files:
test_file = temp_path / "test_default.py"
with open(test_file, "w") as f:
f.write(self._generate_default_test())
test_files.append(test_file)
return test_files
def _calculate_score(self, results: Dict[str, Any]) -> float:
"""
Calculate a score based on test results.
Args:
results: The test results
Returns:
A score between 0 and 1
"""
# Base score on test results
if results["total_tests"] == 0:
test_score = 0.0
else:
test_score = results["passed_tests"] / results["total_tests"]
# Adjust for execution success
execution_score = 1.0 if results["execution"]["success"] else 0.0
# Combine scores with weights
weights = self.config.get("score_weights", {"test": 0.7, "execution": 0.3})
score = (test_score * weights["test"] + execution_score * weights["execution"])
# Apply difficulty modifier
difficulty_modifier = 1.0 + (self.state.difficulty * 0.2)
score = score / difficulty_modifier
return max(0.0, min(1.0, score))
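# Worked example (illustrative numbers): 4/5 tests pass -> test_score = 0.8;
# execution succeeds -> execution_score = 1.0; with the default weights the raw
# score is 0.8 * 0.7 + 1.0 * 0.3 = 0.86. At difficulty 0.5 the modifier is
# 1.0 + 0.5 * 0.2 = 1.1, giving a final score of 0.86 / 1.1 ≈ 0.78.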
def _calculate_complexity(self, code: str) -> float:
"""
Calculate the complexity of code.
Args:
code: The code to analyze
Returns:
A complexity score
"""
# Simple cyclomatic complexity estimation
complexity = 1
# Count control flow statements
for pattern in ["if", "for", "while", "and", "or"]:
complexity += code.count(f" {pattern} ")
# Count function definitions
complexity += code.count("def ")
# Normalize to 0-1 range
normalized = min(1.0, complexity / 50.0)
return normalized
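# Worked example (illustrative): code with 2 "def " definitions, 3 " if ",
# 1 " for ", and 1 " and " occurrences gives complexity
# 1 + (3 + 1 + 0 + 1 + 0) + 2 = 8, normalized to min(1.0, 8 / 50.0) = 0.16.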
def _generate_suggestion_for_test_failure(
self,
issue: Dict[str, Any],
solution: str,
test_results: Dict[str, Any]
) -> Dict[str, Any]:
"""
Generate a suggestion for a test failure.
Args:
issue: The issue data
solution: The solution code
test_results: The test results
Returns:
A suggestion dictionary
"""
test_name = issue["test"]
test_result = test_results[test_name]
# Extract relevant parts of the test
test_content = None
for test in self.state.code_context.get("tests", []):
if test.get("name") == test_name:
test_content = test.get("content")
break
if test_content:
# Try to extract the assertion that failed
assertion_match = re.search(r"assert.*", test_content)
assertion = assertion_match.group(0) if assertion_match else None
# Look for function names in both test and solution
test_funcs = re.findall(r"def\s+(\w+)", test_content)
solution_funcs = re.findall(r"def\s+(\w+)", solution)
# Find functions in test that aren't in solution
missing_funcs = [f for f in test_funcs if f not in solution_funcs]
if missing_funcs:
return {
"type": "missing_function",
"message": f"Implement the missing function(s): {', '.join(missing_funcs)}",
"functions": missing_funcs
}
elif assertion:
return {
"type": "fix_assertion_failure",
"message": f"Fix the code to pass the assertion: {assertion}",
"assertion": assertion,
"expected": test_result.get("expected"),
"actual": test_result.get("actual")
}
else:
return {
"type": "fix_test_failure",
"message": f"Fix the code to pass the test: {test_name}",
"test_name": test_name
}
else:
return {
"type": "general_fix",
"message": f"Fix the code to pass the failing test: {test_name}"
}
def _generate_suggestion_for_error(
self,
issue: Dict[str, Any],
solution: str
) -> Dict[str, Any]:
"""
Generate a suggestion for an error.
Args:
issue: The issue data
solution: The solution code
Returns:
A suggestion dictionary
"""
error_type = issue["error_type"]
message = issue["message"]
location = issue.get("location")
if error_type == "syntax":
return {
"type": "fix_syntax",
"message": f"Fix the syntax error: {message}",
"location": location
}
elif error_type == "runtime":
return {
"type": "fix_runtime_error",
"message": f"Fix the runtime error: {message}",
"location": location
}
else:
return {
"type": "fix_error",
"message": f"Fix the error: {message}",
"error_type": error_type,
"location": location
}
def _determine_focus_areas(
self,
issues: List[Dict[str, Any]],
solution: str,
result: EvaluationResult
) -> List[str]:
"""
Determine focus areas based on issues and results.
Args:
issues: The identified issues
solution: The solution code
result: The evaluation results
Returns:
List of focus areas
"""
focus_areas = []
# Check for syntax issues
syntax_issues = [i for i in issues if i.get("error_type") == "syntax"]
if syntax_issues:
focus_areas.append("syntax")
# Check for failing tests
test_issues = [i for i in issues if i["type"] == "test_failure"]
if test_issues:
if any("expected" in i and "actual" in i for i in test_issues):
focus_areas.append("logic")
else:
focus_areas.append("functionality")
# Check for performance issues
if result.metrics and "execution_time" in result.metrics:
if result.metrics["execution_time"] > self.config.get("performance_threshold", 1.0):
focus_areas.append("performance")
# Check for complexity issues
if result.metrics and "code_complexity" in result.metrics:
if result.metrics["code_complexity"] > self.config.get("complexity_threshold", 0.7):
focus_areas.append("complexity")
# Default focus area if none were identified
if not focus_areas:
focus_areas.append("general")
return focus_areas
def _generate_adaptation_hints(
self,
solution: str,
result: EvaluationResult
) -> List[Dict[str, Any]]:
"""
Generate hints about how the problem might adapt in the next iteration.
Args:
solution: The solution code
result: The evaluation results
Returns:
List of adaptation hints
"""
hints = []
# Hint about potential complexity increases
if result.score > 0.8:
hints.append({
"type": "complexity_increase",
"message": "The problem may become more complex in the next iteration."
})
# Hint about potential requirement changes
if result.score > 0.9 and self.state.evolution_stage >= 1:
hints.append({
"type": "requirement_change",
"message": "The requirements may change in the next iteration."
})
# Hint about potential bug additions
if result.score > 0.95:
hints.append({
"type": "new_bugs",
"message": "New, more subtle bugs may be introduced in the next iteration."
})
# Hint about focus on specific areas
if result.score > 0.7 and result.score < 0.95:
focus_areas = result.metrics.get("focus_areas", [])
if focus_areas:
hints.append({
"type": "focus_shift",
"message": f"The next iteration may focus more on: {', '.join(focus_areas)}",
"areas": focus_areas
})
return hints
def _generate_description(self, state: ProblemState) -> str:
"""
Generate a description for the current problem state.
Args:
state: The problem state
Returns:
A descriptive prompt for the problem
"""
# Base description
base_desc = (
f"Fix the bug(s) in the following code. "
f"This is iteration {state.evolution_stage + 1} of the task."
)
# Add information about known bug categories
if "bug_categories" in state.code_context:
categories = state.code_context["bug_categories"]
if categories:
base_desc += f"\n\nThe code contains the following types of issues: {', '.join(categories)}."
# Add requirements
if state.requirements:
base_desc += "\n\nRequirements:"
for i, req in enumerate(state.requirements):
base_desc += f"\n{i+1}. {req['description']}"
# Add information about difficulty
difficulty_desc = "easy"
if state.difficulty > 0.3 and state.difficulty <= 0.6:
difficulty_desc = "moderate"
elif state.difficulty > 0.6 and state.difficulty <= 0.8:
difficulty_desc = "challenging"
elif state.difficulty > 0.8:
difficulty_desc = "very challenging"
base_desc += f"\n\nThis is a {difficulty_desc} bug fixing task."
return base_desc
def _generate_focused_description(self, state: ProblemState, issues: List[Dict[str, Any]]) -> str:
"""
Generate a description focused on remaining issues.
Args:
state: The problem state
issues: The identified issues
Returns:
A descriptive prompt focused on remaining issues
"""
base_desc = self._generate_description(state)
# Add focus on remaining issues
if issues:
base_desc += "\n\nFocus on the following issues:"
for i, issue in enumerate(issues):
if issue["type"] == "test_failure":
base_desc += f"\n{i+1}. Test failure in '{issue['test']}': {issue['message']}"
else:
base_desc += f"\n{i+1}. {issue['error_type']} error: {issue['message']}"
# Add focus areas if present
if "focus_areas" in state.code_context:
areas = state.code_context["focus_areas"]
if areas:
base_desc += f"\n\nPay particular attention to: {', '.join(areas)}."
return base_desc
def _generate_guided_description(
self,
state: ProblemState,
issues: List[Dict[str, Any]],
suggestions: List[Dict[str, Any]]
) -> str:
"""
Generate a description with added guidance.
Args:
state: The problem state
issues: The identified issues
suggestions: The suggested fixes
Returns:
A descriptive prompt with added guidance
"""
base_desc = self._generate_description(state)
# Add detailed information about issues
if issues:
base_desc += "\n\nThe following issues were identified in your previous solution:"
for i, issue in enumerate(issues):
if issue["type"] == "test_failure":
base_desc += f"\n{i+1}. Test failure in '{issue['test']}': {issue['message']}"
if "expected" in issue and "actual" in issue:
base_desc += f"\n Expected: {issue['expected']}"
base_desc += f"\n Actual: {issue['actual']}"
else:
base_desc += f"\n{i+1}. {issue['error_type']} error: {issue['message']}"
if "location" in issue:
base_desc += f"\n Location: {issue['location']}"
# Add suggestions
if suggestions:
base_desc += "\n\nConsider the following suggestions:"
for i, suggestion in enumerate(suggestions):
base_desc += f"\n{i+1}. {suggestion['message']}"
# Add hints if present
if "hints" in state.code_context:
hints = state.code_context["hints"]
if hints:
base_desc += "\n\nHints:"
for i, hint in enumerate(hints):
base_desc += f"\n{i+1}. {hint}"
return base_desc
def _generate_hints(
self,
solution: str,
result: EvaluationResult,
feedback: Feedback
) -> List[str]:
"""
Generate hints based on the solution and feedback.
Args:
solution: The solution code
result: The evaluation results
feedback: The feedback provided
Returns:
List of hints
"""
hints = []
# Add hints based on failing tests
if result.test_results:
failing_tests = [
test_name for test_name, test_result in result.test_results.items()
if not test_result["passed"]
]
if failing_tests:
test_hint = "Focus on fixing the failing tests"
# Add specific information about test expectations if available
for test_name in failing_tests[:2]: # Limit to first two tests
test_result = result.test_results[test_name]
if "expected" in test_result and "actual" in test_result:
test_hint += f". For test '{test_name}', expected '{test_result['expected']}' but got '{test_result['actual']}'"
hints.append(test_hint + ".")
# Add hints based on errors
if result.error_details:
for error_type, error_info in result.error_details.items():
hints.append(f"Fix the {error_type} error: {error_info.get('message', 'Unknown error')}.")
# Add hints based on focus areas
for area in feedback.focus_areas:
if area == "syntax":
hints.append("Check your syntax carefully, especially parentheses, indentation, and function definitions.")
elif area == "logic":
hints.append("Review the logic of your solution, especially conditional statements and loop conditions.")
elif area == "functionality":
hints.append("Ensure your solution implements all required functionality specified in the tests.")
elif area == "performance":
hints.append("Consider optimizing your solution for better performance, avoid unnecessary operations.")
elif area == "complexity":
hints.append("Try to simplify your solution, it may be more complex than necessary.")
return hints
def _generate_test_hint(self, test_name: str, test_result: Dict[str, Any]) -> str:
"""
Generate a hint for a specific failing test.
Args:
test_name: The name of the test
test_result: The test result
Returns:
A hint for the test
"""
if "expected" in test_result and "actual" in test_result:
return f"The test expected '{test_result['expected']}' but got '{test_result['actual']}'"
elif "message" in test_result:
return test_result["message"]
else:
return "The test failed, but no detailed information is available."
def _add_syntax_error(self, state: ProblemState, solution: str) -> None:
"""
Add a syntax error to the solution code.
Args:
state: The problem state to modify
solution: The current solution
"""
lines = solution.split('\n')
if not lines:
return
# Choose a line to modify
idx = random.randint(0, len(lines) - 1)
line = lines[idx]
# Skip empty lines or comment lines
attempts = 0 # Bounded retries so we never loop forever if every line is blank or a comment
while (not line.strip() or line.strip().startswith('#')) and attempts < 10 * len(lines):
attempts += 1
idx = random.randint(0, len(lines) - 1)
line = lines[idx]
# Choose a modification type
mod_type = random.choice([
"remove_character",
"add_character",
"swap_characters",
"change_indent"
])
if mod_type == "remove_character" and line:
char_idx = random.randint(0, len(line) - 1)
lines[idx] = line[:char_idx] + line[char_idx+1:]
elif mod_type == "add_character":
char_idx = random.randint(0, len(line))
char = random.choice(["(", ")", "{", "}", "[", "]", ":", ";", ",", "."])
lines[idx] = line[:char_idx] + char + line[char_idx:]
elif mod_type == "swap_characters" and len(line) >= 2:
char_idx = random.randint(0, len(line) - 2)
lines[idx] = (line[:char_idx] + line[char_idx+1] +
line[char_idx] + line[char_idx+2:])
elif mod_type == "change_indent":
# Either add or remove indentation
if line.startswith("  "):
lines[idx] = line[2:] # Remove two characters of indentation
else:
lines[idx] = "  " + line # Add two spaces of inconsistent indentation
# Update the code
modified_code = '\n'.join(lines)
state.code_context["code"] = modified_code
# Add information about the modification
if "bugs" not in state.code_context:
state.code_context["bugs"] = []
state.code_context["bugs"].append({
"type": "syntax",
"line": idx + 1,
"description": f"Syntax error introduced in line {idx + 1}"
})
def _add_logical_error(self, state: ProblemState, solution: str, parsed_solution: ast.Module) -> None:
"""
Add a logical error to the solution code.
Args:
state: The problem state to modify
solution: The current solution
parsed_solution: The parsed AST of the solution
"""
modification_types = [
"change_comparison",
"invert_condition",
"off_by_one",
"change_operator",
"reverse_logic"
]
mod_type = random.choice(modification_types)
lines = solution.split('\n')
idx = 0 # Fallback index so the bug record below stays well-defined if nothing is modified
# Find all if statements and loops
if_statements = []
for i, line in enumerate(lines):
if re.search(r'\bif\b|\bwhile\b|\bfor\b', line):
if_statements.append((i, line))
if if_statements:
# Choose an if statement to modify
idx, line = random.choice(if_statements)
if mod_type == "change_comparison":
# Change comparison operators
comparisons = {"==": "!=", "!=": "==", ">": "<", "<": ">", ">=": "<=", "<=": ">="}
for op, new_op in comparisons.items():
if op in line:
lines[idx] = line.replace(op, new_op, 1)
break
elif mod_type == "invert_condition":
# Add or remove a "not" to invert the condition
if "not" in line:
lines[idx] = line.replace("not ", "", 1)
else:
match = re.search(r'(if|while)\s+([^:]+):', line)
if match:
condition = match.group(2)
lines[idx] = line.replace(condition, f"not ({condition})", 1)
elif mod_type == "off_by_one":
# Introduce an off-by-one error
for op in ["+", "-"]:
if op in line:
# If there's a number after the operator, change it
match = re.search(f'\\{op}\\s*(\\d+)', line)
if match:
num = int(match.group(1))
new_num = num + 1 if op == "+" else max(0, num - 1)
lines[idx] = line.replace(f"{op} {num}", f"{op} {new_num}", 1)
break
elif mod_type == "change_operator":
# Change arithmetic or logical operators
operators = {"+": "-", "-": "+", "*": "/", "/": "*", "and": "or", "or": "and"}
for op, new_op in operators.items():
if f" {op} " in line:
lines[idx] = line.replace(f" {op} ", f" {new_op} ", 1)
break
elif mod_type == "reverse_logic":
# Reverse the logic of a compound condition
if " and " in line:
parts = line.split(" and ")
lines[idx] = line.replace(" and ".join(parts), " or ".join(parts), 1)
elif " or " in line:
parts = line.split(" or ")
lines[idx] = line.replace(" or ".join(parts), " and ".join(parts), 1)
else:
# If no if statements found, introduce a different kind of logical error
# Find variable assignments
assignments = []
for i, line in enumerate(lines):
if "=" in line and "==" not in line and "!=" not in line:
assignments.append((i, line))
if assignments:
# Choose an assignment to modify
idx, line = random.choice(assignments)
# Modify the assignment
if "+" in line:
lines[idx] = line.replace("+", "-", 1)
elif "-" in line:
lines[idx] = line.replace("-", "+", 1)
elif "*" in line:
lines[idx] = line.replace("*", "/", 1)
elif "/" in line:
lines[idx] = line.replace("/", "*", 1)
else:
# If no arithmetic operator, change the value
match = re.search(r'=\s*(\d+)', line)
if match:
num = int(match.group(1))
new_num = num + random.choice([-1, 1]) * random.randint(1, 3)
lines[idx] = line.replace(f"= {num}", f"= {new_num}", 1)
# Update the code
modified_code = '\n'.join(lines)
state.code_context["code"] = modified_code
# Add information about the modification
if "bugs" not in state.code_context:
state.code_context["bugs"] = []
state.code_context["bugs"].append({
"type": "logical",
"line": idx + 1,
"description": f"Logical error introduced in line {idx + 1}: {mod_type}"
})
def _add_performance_issue(self, state: ProblemState, solution: str, parsed_solution: ast.Module) -> None:
"""
Add a performance issue to the solution code.
Args:
state: The problem state to modify
solution: The current solution
parsed_solution: The parsed AST of the solution
"""
lines = solution.split('\n')
idx = 0 # Fallback index in case no loop or function definition is found below
# Find loops in the code
loops = []
for i, line in enumerate(lines):
if re.search(r'\bfor\b|\bwhile\b', line):
loops.append((i, line))
if loops:
# Choose a loop to modify
idx, line = random.choice(loops)
# Choose a modification type
mod_type = random.choice([
"add_nested_loop",
"replace_efficient_operation",
"add_redundant_computation"
])
if mod_type == "add_nested_loop":
# Add a nested loop
indent = len(line) - len(line.lstrip())
indent_str = ' ' * indent
loop_body_indent = indent_str + ' '
# Find the next line with the same indentation or less
end_idx = idx + 1
while end_idx < len(lines) and (not lines[end_idx].strip() or len(lines[end_idx]) - len(lines[end_idx].lstrip()) > indent):
end_idx += 1
# Insert a nested loop before the end of the current loop
insert_pos = end_idx
lines.insert(insert_pos, f"{loop_body_indent}for _ in range(100): # Unnecessary loop")
lines.insert(insert_pos + 1, f"{loop_body_indent} pass")
elif mod_type == "replace_efficient_operation":
# Replace an efficient operation with a less efficient one
# Look for list comprehensions or efficient operations
for i in range(idx + 1, min(idx + 10, len(lines))):
if "append" in lines[i] or "extend" in lines[i]:
indent = len(lines[i]) - len(lines[i].lstrip())
indent_str = ' ' * indent
match = re.search(r'(\w+)\.(append|extend)', lines[i])
if match:
list_name = match.group(1)
operation = match.group(2)
item = lines[i].split(f"{list_name}.{operation}(")[1].split(")")[0]
if operation == "append":
# Replace append with concatenation
lines[i] = f"{indent_str}{list_name} = {list_name} + [{item}] # Less efficient than append"
elif operation == "extend":
# Replace extend with concatenation
lines[i] = f"{indent_str}{list_name} = {list_name} + {item} # Less efficient than extend"
break
elif mod_type == "add_redundant_computation":
# Add redundant computation inside the loop
# Find the indentation level of the loop body
if idx + 1 < len(lines):
body_indent = len(lines[idx + 1]) - len(lines[idx + 1].lstrip())
body_indent_str = ' ' * body_indent
# Add redundant computation
lines.insert(idx + 1, f"{body_indent_str}temp = [] # Redundant computation")
lines.insert(idx + 2, f"{body_indent_str}for i in range(1000):")
lines.insert(idx + 3, f"{body_indent_str} temp.append(i)")
lines.insert(idx + 4, f"{body_indent_str} temp.sort() # Unnecessary sort in each iteration")
else:
# If no loops found, introduce inefficient data structure or algorithm
function_defs = []
for i, line in enumerate(lines):
if line.strip().startswith("def "):
function_defs.append((i, line))
if function_defs:
# Choose a function to modify
idx, line = random.choice(function_defs)
# Find the indentation level of the function body
if idx + 1 < len(lines):
body_indent = len(lines[idx + 1]) - len(lines[idx + 1].lstrip())
body_indent_str = ' ' * body_indent
# Add inefficient code at the beginning of the function
lines.insert(idx + 1, f"{body_indent_str}# Inefficient data structure usage")
lines.insert(idx + 2, f"{body_indent_str}data = []")
lines.insert(idx + 3, f"{body_indent_str}for i in range(1000):")
lines.insert(idx + 4, f"{body_indent_str} data.append(i)")
lines.insert(idx + 5, f"{body_indent_str} # Inefficient search operation")
lines.insert(idx + 6, f"{body_indent_str} if i in data: # Linear search instead of using a set")
lines.insert(idx + 7, f"{body_indent_str} pass")
# Update the code
modified_code = '\n'.join(lines)
state.code_context["code"] = modified_code
# Add information about the modification
if "bugs" not in state.code_context:
state.code_context["bugs"] = []
state.code_context["bugs"].append({
"type": "performance",
"line": idx + 1,
"description": f"Performance issue introduced around line {idx + 1}"
})
def _add_edge_case_issue(self, state: ProblemState, solution: str, parsed_solution: ast.Module) -> None:
"""
Add an edge case issue to the solution code.
Args:
state: The problem state to modify
solution: The current solution
parsed_solution: The parsed AST of the solution
"""
lines = solution.split('\n')
start_idx, func_name, mod_type = 0, "unknown", "none" # Fallbacks in case no function is found below
# Find functions in the code
functions = []
current_func = None
func_start = None
for i, line in enumerate(lines):
if line.strip().startswith("def "):
if current_func:
functions.append((func_start, i - 1, current_func))
current_func = line.strip()[4:].split("(")[0]
func_start = i
elif i == len(lines) - 1 and current_func:
functions.append((func_start, i, current_func))
if functions:
# Choose a function to modify
start_idx, end_idx, func_name = random.choice(functions)
# Choose a modification type
mod_type = random.choice([
"remove_boundary_check",
"introduce_zero_division",
"handling_empty_input",
"type_assumption"
])
if mod_type == "remove_boundary_check":
# Find and remove or modify boundary checks
for i in range(start_idx, end_idx + 1):
if re.search(r'if\s+.*(?:len|count|size|length|empty|<=|>=|<|>|\!=)', lines[i]):
# Comment out the boundary check
lines[i] = f"# {lines[i]} # Boundary check removed"
# Skip the body of the if statement
j = i + 1
indent = len(lines[i]) - len(lines[i].lstrip())
body_indent = indent + 4
while j <= end_idx and (not lines[j].strip() or len(lines[j]) - len(lines[j].lstrip()) >= body_indent):
lines[j] = f"# {lines[j]}"
j += 1
break
elif mod_type == "introduce_zero_division":
# Find division operations and modify them
for i in range(start_idx, end_idx + 1):
if "/" in lines[i] and "try" not in lines[i] and "except" not in lines[i]:
# Remove denominator check if it exists
if re.search(r'if\s+.*(?:!=\s*0|>\s*0)', lines[i]):
lines[i] = f"# {lines[i]} # Denominator check removed"
else:
# Or modify a division to potentially cause zero division
match = re.search(r'(\w+)\s*/\s*(\w+)', lines[i])
if match:
denominator = match.group(2)
# Add a potential zero value for the denominator
indent = len(lines[i]) - len(lines[i].lstrip())
indent_str = ' ' * indent
lines.insert(i, f"{indent_str}if random.random() < 0.1: # Introduce potential zero division")
lines.insert(i + 1, f"{indent_str} {denominator} = 0")
break
elif mod_type == "handling_empty_input":
# Modify parameter handling to not handle empty inputs correctly
params = re.search(r'def\s+\w+\s*\((.*?)\)', lines[start_idx])
if params and params.group(1):
param_list = [p.strip() for p in params.group(1).split(",")]
if param_list:
param = param_list[0].split("=")[0].strip()
# Find checks for the parameter
for i in range(start_idx + 1, end_idx + 1):
if re.search(rf'if\s+.*(?:not\s+{param}|len\s*\(\s*{param}\s*\)\s*==\s*0)', lines[i]):
# Comment out the empty check
lines[i] = f"# {lines[i]} # Empty input check removed"
# Skip the body of the if statement
j = i + 1
indent = len(lines[i]) - len(lines[i].lstrip())
body_indent = indent + 4
while j <= end_idx and (not lines[j].strip() or len(lines[j]) - len(lines[j].lstrip()) >= body_indent):
lines[j] = f"# {lines[j]}"
j += 1
break
elif mod_type == "type_assumption":
# Introduce assumptions about parameter types
params = re.search(r'def\s+\w+\s*\((.*?)\)', lines[start_idx])
if params and params.group(1):
param_list = [p.strip() for p in params.group(1).split(",")]
if param_list:
param = param_list[0].split("=")[0].strip()
# Find type checks for the parameter
type_check_found = False
for i in range(start_idx + 1, end_idx + 1):
if re.search(rf'(?:isinstance|type)\s*\(\s*{param}\s*,', lines[i]):
# Comment out the type check
lines[i] = f"# {lines[i]} # Type check removed"
type_check_found = True
break
if not type_check_found:
# Add a problematic type assumption
indent = 4 # Assume basic indentation
for i in range(start_idx + 1, min(start_idx + 5, end_idx + 1)):
if lines[i].strip():
indent = len(lines[i]) - len(lines[i].lstrip())
break
indent_str = ' ' * indent
# Add code that assumes a specific type
lines.insert(start_idx + 1, f"{indent_str}# Assuming {param} is a specific type without checking")
lines.insert(start_idx + 2, f"{indent_str}{param}_length = len({param}) # Will fail if {param} doesn't support len()")
# Update the code
modified_code = '\n'.join(lines)
state.code_context["code"] = modified_code
# Add information about the modification
if "bugs" not in state.code_context:
state.code_context["bugs"] = []
state.code_context["bugs"].append({
"type": "edge_case",
"line": start_idx + 1,
"description": f"Edge case issue introduced in function '{func_name}': {mod_type}"
})
def _generate_new_requirement(self, state: ProblemState, solution: str) -> Dict[str, Any]:
"""
Generate a new requirement based on the current state and solution.
Args:
state: The current problem state
solution: The current solution
Returns:
A new requirement dictionary
"""
# Parse the solution to find functions and variables
function_names = re.findall(r'def\s+(\w+)', solution)
variable_names = re.findall(r'(\w+)\s*=', solution)
# Choose a requirement type
req_type = random.choice([
"edge_case_handling",
"performance_improvement",
"error_handling",
"type_checking",
"feature_addition"
])
if req_type == "edge_case_handling":
if function_names:
func_name = random.choice(function_names)
edge_cases = [
"empty input",
"negative values",
"zero values",
"extremely large values",
"special characters",
"duplicate values"
]
edge_case = random.choice(edge_cases)
return {
"type": "edge_case_handling",
"description": f"The function '{func_name}' should handle {edge_case} correctly.",
"difficulty": random.uniform(0.3, 0.7)
}
elif req_type == "performance_improvement":
return {
"type": "performance_improvement",
"description": "The solution should be optimized to run in O(n) time or better.",
"difficulty": random.uniform(0.4, 0.8)
}
elif req_type == "error_handling":
error_types = [
"invalid input",
"division by zero",
"file not found",
"network timeout",
"permission denied"
]
error_type = random.choice(error_types)
return {
"type": "error_handling",
"description": f"The code should handle {error_type} errors gracefully.",
"difficulty": random.uniform(0.2, 0.6)
}
elif req_type == "type_checking":
if function_names:
func_name = random.choice(function_names)
return {
"type": "type_checking",
"description": f"The function '{func_name}' should validate input types before processing.",
"difficulty": random.uniform(0.1, 0.5)
}
elif req_type == "feature_addition":
features = [
"logging capability",
"progress tracking",
"caching for repeated operations",
"parameter validation",
"configuration options"
]
feature = random.choice(features)
return {
"type": "feature_addition",
"description": f"Add {feature} to the solution.",
"difficulty": random.uniform(0.3, 0.7)
}
# Default requirement if none of the above were applicable
return {
"type": "general_improvement",
"description": "Improve the overall code quality and readability.",
"difficulty": random.uniform(0.1, 0.4)
}
def _modify_requirement(self, requirement: Dict[str, Any], state: ProblemState, solution: str) -> Dict[str, Any]:
"""
Modify an existing requirement to make it more challenging.
Args:
requirement: The requirement to modify
state: The current problem state
solution: The current solution
Returns:
The modified requirement
"""
# Make a copy of the requirement
modified_req = copy.deepcopy(requirement)
# Increase the difficulty
modified_req["difficulty"] = min(1.0, requirement.get("difficulty", 0.3) + random.uniform(0.1, 0.3))
# Modify the description based on the requirement type
if requirement["type"] == "edge_case_handling":
modified_req["description"] += " Additionally, it should handle very large inputs efficiently."
elif requirement["type"] == "performance_improvement":
modified_req["description"] = modified_req["description"].replace("O(n)", "O(log n)")
elif requirement["type"] == "error_handling":
modified_req["description"] += " And provide detailed error messages for debugging."
elif requirement["type"] == "type_checking":
modified_req["description"] += " And automatically convert types when possible."
elif requirement["type"] == "feature_addition":
modified_req["description"] += " Ensure this feature is configurable via parameters."
else:
modified_req["description"] += " The code should also be well-documented with comments."
return modified_req
def _add_edge_case_requirement(self, state: ProblemState) -> None:
"""
Add a requirement for handling edge cases.
Args:
state: The problem state to modify
"""
edge_cases = [
"empty collections",
"null/None values",
"boundary values (min/max)",
"negative numbers",
"special characters",
"Unicode characters",
"very large inputs",
"malformed input"
]
edge_case = random.choice(edge_cases)
# Add a new requirement
state.requirements.append({
"type": "edge_case_handling",
"description": f"The solution must correctly handle {edge_case}.",
"difficulty": random.uniform(0.3, 0.7)
})
# Add test cases for the edge case if tests exist
if "tests" in state.code_context:
# Create a new test for the edge case
test_template = self._generate_edge_case_test(edge_case, state.code_context)
if test_template:
state.code_context["tests"].append({
"name": f"test_edge_case_{len(state.code_context['tests'])}",
"content": test_template,
"description": f"Test handling of {edge_case}"
})
def _increase_data_volume(self, state: ProblemState, solution: str) -> None:
"""
Modify the problem to require handling larger data volumes.
Args:
state: The problem state to modify
solution: The current solution
"""
# Add a requirement for handling large data
state.requirements.append({
"type": "scalability",
"description": "The solution must efficiently handle large datasets (10,000+ items).",
"difficulty": random.uniform(0.5, 0.8)
})
# Modify existing tests to use larger data if tests exist
if "tests" in state.code_context:
for i, test in enumerate(state.code_context["tests"]):
content = test["content"]
# Look for small lists or arrays in tests
for pattern, replacement in [
(r'\[[^\]]{0,50}\]', '[random.randint(0, 1000) for _ in range(10000)]'),
(r'range\(\d+\)', 'range(10000)'),
(r'"[^"]{0,20}"', '"' + 'a' * 10000 + '"')
]:
match = re.search(pattern, content)
if match and random.random() < 0.3: # Only replace some instances
content = content.replace(match.group(0), replacement, 1)
break
state.code_context["tests"][i]["content"] = content
state.code_context["tests"][i]["description"] = f"{test.get('description', 'Test')} (with large data)"
def _add_performance_constraint(self, state: ProblemState, solution: str) -> None:
"""
Add a performance constraint to the problem.
Args:
state: The problem state to modify
solution: The current solution
"""
# Choose a performance constraint
constraints = [
"linear time complexity (O(n))",
"logarithmic time complexity (O(log n))",
"constant memory usage (O(1) space)",
"execution time under 100ms for large inputs",
"minimal function calls"
]
constraint = random.choice(constraints)
# Add a new requirement
state.requirements.append({
"type": "performance",
"description": f"The solution must achieve {constraint}.",
"difficulty": random.uniform(0.6, 0.9)
})
# Add performance testing code if tests exist
if "tests" in state.code_context:
# Add a performance test
perf_test = self._generate_performance_test(constraint, state.code_context)
if perf_test:
state.code_context["tests"].append({
"name": f"test_performance_{len(state.code_context['tests'])}",
"content": perf_test,
"description": f"Test {constraint}"
})
def _expand_functionality(self, state: ProblemState, solution: str) -> None:
"""
Expand the required functionality of the solution.
Args:
state: The problem state to modify
solution: The current solution
"""
# Choose a functionality expansion
expansions = [
"support for different input types",
"parameterized behavior",
"additional output formats",
"flexible error handling",
"integration with external systems"
]
expansion = random.choice(expansions)
# Add a new requirement
state.requirements.append({
"type": "functionality",
"description": f"Expand the solution to include {expansion}.",
"difficulty": random.uniform(0.4, 0.8)
})
# Add test cases for the new functionality if tests exist
if "tests" in state.code_context:
# Create a new test for the expanded functionality
test_template = self._generate_functionality_test(expansion, state.code_context)
if test_template:
state.code_context["tests"].append({
"name": f"test_expanded_functionality_{len(state.code_context['tests'])}",
"content": test_template,
"description": f"Test {expansion}"
})
def _generate_default_test(self) -> str:
"""
Generate a default test based on the current problem state.
Returns:
A default test script
"""
# Generate a basic test script
return """
import unittest
import sys
import os
# Add the directory containing the solution to the path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Import the solution
from solution import *
class DefaultTest(unittest.TestCase):
def test_basic_functionality(self):
# A basic test that should pass if the solution is correct
self.assertTrue(True, "Basic assertion failed")
def test_expected_output(self):
# Test expected output of main functions
# This will need to be updated based on the specific problem
pass
if __name__ == '__main__':
unittest.main()
"""
def _generate_edge_case_test(self, edge_case: str, code_context: Dict[str, Any]) -> str:
"""
Generate a test for an edge case.
Args:
edge_case: The edge case to test
code_context: The code context containing information about the problem
Returns:
A test script for the edge case
"""
# Extract function names from the code context
function_names = []
if "code" in code_context:
function_names = re.findall(r'def\s+(\w+)', code_context["code"])
if not function_names:
return None
# Choose a function to test
function_name = random.choice(function_names)
# Generate test code based on the edge case
if edge_case == "empty collections":
return f"""
import unittest
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import {function_name}
class EmptyCollectionTest(unittest.TestCase):
def test_empty_input(self):
# Test with empty list
result = {function_name}([])
self.assertIsNotNone(result, "Function should handle empty list")
# Test with empty string
result = {function_name}("")
self.assertIsNotNone(result, "Function should handle empty string")
# Test with empty dict
result = {function_name}({{}})
self.assertIsNotNone(result, "Function should handle empty dict")
if __name__ == '__main__':
unittest.main()
"""
elif edge_case == "null/None values":
return f"""
import unittest
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import {function_name}
class NoneValueTest(unittest.TestCase):
def test_none_input(self):
# Test with None as input
result = {function_name}(None)
self.assertIsNotNone(result, "Function should handle None input")
# Test with list containing None
result = {function_name}([1, None, 3])
self.assertIsNotNone(result, "Function should handle list with None values")
if __name__ == '__main__':
unittest.main()
"""
elif edge_case == "boundary values (min/max)":
return f"""
import unittest
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import {function_name}
class BoundaryValueTest(unittest.TestCase):
def test_min_max_values(self):
# Test with minimum integer
min_int = -sys.maxsize - 1
result = {function_name}(min_int)
self.assertIsNotNone(result, "Function should handle minimum integer")
# Test with maximum integer
max_int = sys.maxsize
result = {function_name}(max_int)
self.assertIsNotNone(result, "Function should handle maximum integer")
# Test with very large list
large_list = list(range(10000))
result = {function_name}(large_list)
self.assertIsNotNone(result, "Function should handle very large inputs")
if __name__ == '__main__':
unittest.main()
"""
elif edge_case == "negative numbers":
return f"""
import unittest
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import {function_name}
class NegativeNumberTest(unittest.TestCase):
def test_negative_numbers(self):
# Test with negative number
result = {function_name}(-1)
self.assertIsNotNone(result, "Function should handle negative numbers")
# Test with list of negative numbers
result = {function_name}([-1, -2, -3])
self.assertIsNotNone(result, "Function should handle lists of negative numbers")
# Test with mixed positive and negative
result = {function_name}([-1, 0, 1])
self.assertIsNotNone(result, "Function should handle mixed positive and negative")
if __name__ == '__main__':
unittest.main()
"""
else:
# Generic edge case test
return f"""
import unittest
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import {function_name}
class EdgeCaseTest(unittest.TestCase):
def test_edge_case_{edge_case.replace(' ', '_')}(self):
# Test edge case: {edge_case}
# This is a placeholder test that needs to be customized for the specific edge case
self.assertTrue(True, "Edge case test not implemented")
if __name__ == '__main__':
unittest.main()
"""
def _generate_performance_test(self, constraint: str, code_context: Dict[str, Any]) -> str:
"""
Generate a performance test based on a constraint.
Args:
constraint: The performance constraint
code_context: The code context containing information about the problem
Returns:
A test script for the performance constraint
"""
# Extract function names from the code context
function_names = []
if "code" in code_context:
function_names = re.findall(r'def\s+(\w+)', code_context["code"])
if not function_names:
return None
# Choose a function to test
function_name = random.choice(function_names)
if "time complexity" in constraint:
return f"""
import unittest
import sys
import os
import time
import random
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import {function_name}
class PerformanceTest(unittest.TestCase):
def test_time_complexity(self):
# Test for {constraint}
sizes = [100, 1000, 10000]
times = []
for size in sizes:
# Generate input of the given size
input_data = [random.randint(0, 1000) for _ in range(size)]
# Measure execution time
start_time = time.time()
{function_name}(input_data)
end_time = time.time()
times.append(end_time - start_time)
# Check if time grows appropriately
# For O(n), time should grow linearly with input size
# For O(log n), time should grow logarithmically
# This is a simplified check and might need adjustment
if "log n" in "{constraint}":
# For logarithmic time, the ratio of times should decrease
ratio1 = times[1] / times[0]
ratio2 = times[2] / times[1]
self.assertLess(ratio2, ratio1 * 1.5,
f"Growth rate appears super-logarithmic: {times}")
else: # Assume linear or better
# For linear time, the ratio of times should be roughly equal to ratio of sizes
ratio1 = times[1] / times[0]
size_ratio1 = sizes[1] / sizes[0]
ratio2 = times[2] / times[1]
size_ratio2 = sizes[2] / sizes[1]
self.assertLess(ratio1, size_ratio1 * 1.5,
f"First growth rate appears super-linear: {times}")
self.assertLess(ratio2, size_ratio2 * 1.5,
f"Second growth rate appears super-linear: {times}")
if __name__ == '__main__':
unittest.main()
"""
elif "execution time" in constraint:
return f"""
import unittest
import sys
import os
import time
import random
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import {function_name}
class PerformanceTest(unittest.TestCase):
def test_execution_time(self):
# Test for {constraint}
# Generate a large input
input_data = [random.randint(0, 1000) for _ in range(10000)]
# Measure execution time
start_time = time.time()
{function_name}(input_data)
end_time = time.time()
execution_time = (end_time - start_time) * 1000 # Convert to ms
self.assertLess(execution_time, 100,
f"Execution time exceeded 100ms: {execution_time:.2f}ms")
if __name__ == '__main__':
unittest.main()
"""
elif "memory usage" in constraint:
return f"""
import unittest
import sys
import os
import psutil
import random
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import {function_name}
class MemoryUsageTest(unittest.TestCase):
def test_memory_usage(self):
# Test for {constraint}
# Note: This is an approximate test and may not be accurate in all environments
# Get current process
process = psutil.Process(os.getpid())
# Measure memory before
memory_before = process.memory_info().rss / 1024 / 1024 # MB
# Generate a large input
input_data = [random.randint(0, 1000) for _ in range(100000)]
# Run function
{function_name}(input_data)
# Measure memory after
memory_after = process.memory_info().rss / 1024 / 1024 # MB
# Calculate memory usage
memory_used = memory_after - memory_before
# A crude approximation, adjust as needed
self.assertLess(memory_used, 10,
f"Memory usage seems high: {memory_used:.2f}MB")
if __name__ == '__main__':
unittest.main()
"""
else:
# Generic performance test
return f"""
import unittest
import sys
import os
import time
import random
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import {function_name}
class PerformanceTest(unittest.TestCase):
def test_performance(self):
# Test for {constraint}
# This is a placeholder test that needs to be customized for the specific constraint
# Generate a large input
input_data = [random.randint(0, 1000) for _ in range(10000)]
# Measure execution time
start_time = time.time()
{function_name}(input_data)
end_time = time.time()
execution_time = end_time - start_time
# Just log the time for now
print(f"Execution time: {execution_time:.4f} seconds")
self.assertTrue(True, "Performance test completed")
if __name__ == '__main__':
unittest.main()
"""
def _generate_functionality_test(self, expansion: str, code_context: Dict[str, Any]) -> str:
"""
Generate a test for expanded functionality.
Args:
expansion: The functionality expansion
code_context: The code context containing information about the problem
Returns:
A test script for the expanded functionality
"""
# Extract function names from the code context
function_names = []
if "code" in code_context:
function_names = re.findall(r'def\s+(\w+)', code_context["code"])
if not function_names:
return None
# Choose a function to test
function_name = random.choice(function_names)
if "different input types" in expansion:
return f"""
import unittest
import sys
import os
import json
from collections import namedtuple
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import {function_name}
class InputTypesTest(unittest.TestCase):
def test_different_input_types(self):
# Test with different types of inputs
# Test with list
list_input = [1, 2, 3]
list_result = {function_name}(list_input)
self.assertIsNotNone(list_result, "Function should handle list input")
# Test with tuple
tuple_input = (1, 2, 3)
tuple_result = {function_name}(tuple_input)
self.assertIsNotNone(tuple_result, "Function should handle tuple input")
# Test with set
set_input = {{1, 2, 3}}
set_result = {function_name}(set_input)
self.assertIsNotNone(set_result, "Function should handle set input")
# Test with dictionary
dict_input = {{"a": 1, "b": 2, "c": 3}}
dict_result = {function_name}(dict_input)
self.assertIsNotNone(dict_result, "Function should handle dictionary input")
# Test with JSON string
json_input = '{{"data": [1, 2, 3]}}'
json_result = {function_name}(json_input)
self.assertIsNotNone(json_result, "Function should handle JSON string")
# Test with custom object
Point = namedtuple('Point', ['x', 'y'])
obj_input = Point(1, 2)
obj_result = {function_name}(obj_input)
self.assertIsNotNone(obj_result, "Function should handle custom object")
if __name__ == '__main__':
unittest.main()
"""
elif "parameterized behavior" in expansion:
return f"""
import unittest
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import {function_name}
class ParameterizedTest(unittest.TestCase):
def test_parameterized_behavior(self):
# Test function with different parameters
# Base case with default parameters
base_input = [1, 2, 3]
base_result = {function_name}(base_input)
# The function should now accept additional parameters
# These are example parameters, adjust based on the specific function
# With sorting parameter
try:
sorted_result = {function_name}(base_input, sort=True)
self.assertIsNotNone(sorted_result, "Function should handle sort parameter")
except TypeError as e:
self.fail(f"Function does not support sort parameter: {{e}}")
# With filtering parameter
try:
filtered_result = {function_name}(base_input, filter_fn=lambda x: x > 1)
self.assertIsNotNone(filtered_result, "Function should handle filter_fn parameter")
except TypeError as e:
self.fail(f"Function does not support filter_fn parameter: {{e}}")
# With formatting parameter
try:
formatted_result = {function_name}(base_input, format="json")
self.assertIsNotNone(formatted_result, "Function should handle format parameter")
except TypeError as e:
self.fail(f"Function does not support format parameter: {{e}}")
if __name__ == '__main__':
unittest.main()
"""
elif "additional output formats" in expansion:
return f"""
import unittest
import sys
import os
import json
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import {function_name}
class OutputFormatsTest(unittest.TestCase):
def test_output_formats(self):
# Test function with different output formats
input_data = [1, 2, 3]
# Original format
original_result = {function_name}(input_data)
# The function should now support different output formats
# These are example formats, adjust based on the specific function
# JSON format
try:
json_result = {function_name}(input_data, format="json")
# Check if it's valid JSON
try:
json_obj = json.loads(json_result) if isinstance(json_result, str) else json_result
self.assertIsNotNone(json_obj, "JSON result should be valid")
except json.JSONDecodeError:
self.fail("JSON result is not valid")
except TypeError as e:
self.fail(f"Function does not support JSON format: {{e}}")
# CSV format
try:
csv_result = {function_name}(input_data, format="csv")
self.assertIsNotNone(csv_result, "CSV result should not be None")
if isinstance(csv_result, str):
self.assertIn(",", csv_result, "CSV result should contain commas")
except TypeError as e:
self.fail(f"Function does not support CSV format: {{e}}")
# XML format
try:
xml_result = {function_name}(input_data, format="xml")
self.assertIsNotNone(xml_result, "XML result should not be None")
if isinstance(xml_result, str):
self.assertIn("<", xml_result, "XML result should contain tags")
self.assertIn(">", xml_result, "XML result should contain tags")
except TypeError as e:
self.fail(f"Function does not support XML format: {{e}}")
if __name__ == '__main__':
unittest.main()
"""
else:
# Generic functionality expansion test
return f"""
import unittest
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import {function_name}
class ExpandedFunctionalityTest(unittest.TestCase):
def test_expanded_functionality(self):
# Test for {expansion}
# This is a placeholder test that needs to be customized for the specific expansion
# Basic test to verify the function exists
input_data = [1, 2, 3]
result = {function_name}(input_data)
self.assertIsNotNone(result, "Function should return a result")
# You need to add specific tests for the expanded functionality
if __name__ == '__main__':
unittest.main()
"""
def _calculate_adaptation_vector(self, solution: str, result: EvaluationResult, feedback: Feedback) -> List[float]:
"""
Calculate an adaptation vector based on the solution, result, and feedback.
The adaptation vector encodes how the problem should evolve in future iterations,
capturing dimensions like difficulty, bug type emphasis, and feedback focus.
Args:
solution: The current solution
result: The evaluation results
feedback: The feedback provided
Returns:
An adaptation vector (list of floats)
"""
# Initialize adaptation vector with zeros
# Dimensions:
# [0] - difficulty adjustment
# [1] - syntax vs logical bug emphasis
# [2] - performance focus
# [3] - edge case focus
# [4] - requirement expansion
adaptation_vector = [0.0] * 5
# Adjust difficulty based on score
if result.score > 0.95:
adaptation_vector[0] = 0.2 # Increase difficulty significantly
elif result.score > 0.8:
adaptation_vector[0] = 0.1 # Increase difficulty moderately
elif result.score > 0.6:
adaptation_vector[0] = 0.0 # Maintain current difficulty
elif result.score > 0.4:
adaptation_vector[0] = -0.1 # Decrease difficulty moderately
else:
adaptation_vector[0] = -0.2 # Decrease difficulty significantly
# Adjust bug type emphasis based on error types
syntax_issues = sum(1 for issue in feedback.issues if issue.get("error_type") == "syntax")
logical_issues = sum(1 for issue in feedback.issues if issue.get("type") == "test_failure")
if syntax_issues > logical_issues:
adaptation_vector[1] = -0.1 # Move toward more logical bugs
elif logical_issues > syntax_issues:
adaptation_vector[1] = 0.1 # Move toward more syntax bugs
# Adjust performance focus based on execution time and metrics
if result.metrics and "execution_time" in result.metrics:
if result.metrics["execution_time"] > self.config.get("performance_threshold", 1.0):
adaptation_vector[2] = 0.2 # Increase performance focus
else:
adaptation_vector[2] = -0.1 # Decrease performance focus
# Adjust edge case focus based on test failures
if result.test_results:
edge_case_failures = sum(1 for test_name, test_result in result.test_results.items()
if not test_result["passed"] and "edge" in test_name.lower())
if edge_case_failures > 0:
adaptation_vector[3] = 0.2 # Increase edge case focus
else:
adaptation_vector[3] = 0.0 # Maintain current edge case focus
# Adjust requirement expansion based on current state
current_requirements = len(self.state.requirements)
if current_requirements < 3:
adaptation_vector[4] = 0.1 # Increase likelihood of adding requirements
elif current_requirements >= 5:
adaptation_vector[4] = -0.1 # Decrease likelihood of adding requirements
return adaptation_vector
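# Illustrative sketch: how a consumer might read the five dimensions computed above.
# The evolution step that actually consumes this vector lives in the core framework;
# the variable names here are for readability only.
#
#   (difficulty_delta,      # [0] raise/lower overall difficulty
#    bug_type_shift,        # [1] < 0 favors logical bugs, > 0 favors syntax bugs
#    performance_focus,     # [2] emphasis on performance constraints
#    edge_case_focus,       # [3] emphasis on edge-case coverage
#    requirement_expansion  # [4] likelihood of adding new requirements
#    ) = adaptation_vector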
class DefaultTestRunner:
"""Default test runner for evaluating bug fixes."""
def run_tests(self, solution_file: Path, test_files: List[Path], code_context: Dict[str, Any]) -> Dict[str, Any]:
"""
Run tests against a solution file.
Args:
solution_file: Path to the solution file
test_files: List of test file paths
code_context: Context information about the code
Returns:
Dictionary of test results
"""
# Initialize results
results = {
"all_passed": True,
"passed_tests": 0,
"total_tests": 0,
"tests": {},
"execution": {
"success": True,
"error": None,
"stdout": None,
"stderr": None
},
"execution_time": 0.0
}
# Import the solution to check for syntax errors
try:
# Check if the solution file exists
if not solution_file.exists():
results["execution"]["success"] = False
results["execution"]["error"] = "Solution file not found"
results["all_passed"] = False
return results
# Try to import the module to test for syntax errors
import sys
sys.path.insert(0, str(solution_file.parent))
import importlib.util
spec = importlib.util.spec_from_file_location("solution", solution_file)
solution_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(solution_module)
# Check for required functions
if "required_functions" in code_context:
for func_name in code_context["required_functions"]:
if not hasattr(solution_module, func_name):
results["execution"]["success"] = False
results["execution"]["error"] = f"Required function '{func_name}' not found"
results["all_passed"] = False
return results
except Exception as e:
results["execution"]["success"] = False
results["execution"]["error"] = str(e)
results["all_passed"] = False
return results
# Run each test file
for test_file in test_files:
# Skip if the test file doesn't exist
if not test_file.exists():
continue
# Run the test file
import unittest
import io
from contextlib import redirect_stdout, redirect_stderr
# Create a test loader and find tests in the file
loader = unittest.TestLoader()
try:
tests = loader.discover(str(test_file.parent), pattern=test_file.name)
# Count the number of test cases
test_cases = 0
for suite in tests:
for test_case in suite:
test_cases += test_case.countTestCases()
results["total_tests"] += test_cases
# Run the tests
runner = unittest.TextTestRunner(verbosity=2)
# Capture stdout and stderr
stdout_buffer = io.StringIO()
stderr_buffer = io.StringIO()
with redirect_stdout(stdout_buffer), redirect_stderr(stderr_buffer):
test_result = runner.run(tests)
stdout = stdout_buffer.getvalue()
stderr = stderr_buffer.getvalue()
# Check if all tests passed
if not test_result.wasSuccessful():
results["all_passed"] = False
# Count passed tests
passed_tests = test_cases - len(test_result.failures) - len(test_result.errors)
results["passed_tests"] += passed_tests
# Store individual test results
test_name = test_file.stem
results["tests"][test_name] = {
"passed": test_result.wasSuccessful(),
"failures": len(test_result.failures),
"errors": len(test_result.errors),
"skipped": len(test_result.skipped),
"total": test_cases,
"passed_count": passed_tests,
"stdout": stdout,
"stderr": stderr
}
# Extract more detailed information about failures
for failure in test_result.failures:
test_id = failure[0].id()
failure_message = failure[1]
# Extract expected and actual values if available
import re
expected_match = re.search(r'Expected\s*:(.+)', failure_message)
actual_match = re.search(r'Actual\s*:(.+)', failure_message)
expected = expected_match.group(1).strip() if expected_match else None
actual = actual_match.group(1).strip() if actual_match else None
if test_id not in results["tests"]:
results["tests"][test_id] = {}
results["tests"][test_id].update({
"passed": False,
"message": failure_message,
"expected": expected,
"actual": actual
})
except Exception as e:
# If the test file itself has errors
results["all_passed"] = False
results["tests"][test_file.stem] = {
"passed": False,
"error": str(e),
"failures": 1,
"errors": 1,
"skipped": 0,
"total": 1,
"passed_count": 0
}
results["total_tests"] += 1
return results
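# Shape of the dictionary returned above (values are illustrative):
#
#   {
#       "all_passed": False,
#       "passed_tests": 3,
#       "total_tests": 5,
#       "tests": {"test_calculate_sum": {"passed": False, "failures": 1, "errors": 0, ...}},
#       "execution": {"success": True, "error": None, "stdout": None, "stderr": None},
#       "execution_time": 0.0,
#   }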
class BugFixingTaskGenerator:
"""Generator for bug fixing tasks."""
def __init__(self, config: Dict[str, Any] = None):
"""
Initialize the bug fixing task generator.
Args:
config: Configuration options
"""
self.config = config or {}
self.difficulty_levels = self.config.get(
"difficulty_levels",
["easy", "medium", "hard", "expert"]
)
self.bug_categories = self.config.get(
"bug_categories",
[
BugCategory.SYNTAX,
BugCategory.LOGICAL,
BugCategory.EDGE_CASE,
BugCategory.PERFORMANCE
]
)
self.test_templates = self._load_test_templates()
def generate_task(self, difficulty: str = None, bug_categories: List[str] = None) -> BugFixingTask:
"""
Generate a new bug fixing task.
Args:
difficulty: The difficulty level (easy, medium, hard, expert)
bug_categories: List of bug categories to include
Returns:
A new bug fixing task
"""
# Choose difficulty if not specified
if difficulty is None:
difficulty = random.choice(self.difficulty_levels)
# Choose bug categories if not specified
if bug_categories is None:
num_categories = random.randint(1, 3)
bug_categories = random.sample(self.bug_categories, num_categories)
# Generate a problem based on difficulty and bug categories
problem_state = self._generate_problem_state(difficulty, bug_categories)
# Create config for the task
task_config = {
"difficulty": difficulty,
"bug_categories": bug_categories,
"convergence_criteria": {
"score_threshold": 0.95,
"min_iterations": 1,
"max_iterations": self.config.get("max_iterations", 5),
"score_delta_threshold": 0.05,
"consecutive_plateau_limit": 2
},
"score_weights": {
"test": 0.7,
"execution": 0.3
},
"performance_threshold": 1.0,
"complexity_threshold": 0.7
}
# Create and return the task
return BugFixingTask(problem_state, task_config)
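# Minimal usage sketch (assumes the core recursive_swe_bench classes are importable):
#
#   generator = BugFixingTaskGenerator({"max_iterations": 3})
#   task = generator.generate_task(
#       difficulty="medium",
#       bug_categories=[BugCategory.LOGICAL, BugCategory.EDGE_CASE],
#   )
#   print(task.state.description)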
def _generate_problem_state(self, difficulty: str, bug_categories: List[str]) -> ProblemState:
"""
Generate a problem state for the given difficulty and bug categories.
Args:
difficulty: The difficulty level
bug_categories: List of bug categories
Returns:
A problem state for the task
"""
# Choose a template based on difficulty and bug categories
template = self._choose_template(difficulty, bug_categories)
# Create a copy of the template
problem_state = copy.deepcopy(template)
# Generate a unique ID
problem_state.problem_id = str(uuid.uuid4())
# Initialize evolution stage and adaptation vector
problem_state.evolution_stage = 0
problem_state.adaptation_vector = [0.0] * 5
# Adjust difficulty value based on level
difficulty_values = {
"easy": 0.25,
"medium": 0.5,
"hard": 0.75,
"expert": 0.9
}
problem_state.difficulty = difficulty_values.get(difficulty, 0.5)
# Insert bugs based on categories
for category in bug_categories:
self._insert_bug(problem_state, category)
# Update description to reflect the current state
problem_state.description = self._generate_description(problem_state)
return problem_state
def _choose_template(self, difficulty: str, bug_categories: List[str]) -> ProblemState:
"""
Choose a template that matches the difficulty and bug categories.
Args:
difficulty: The difficulty level
bug_categories: List of bug categories
Returns:
A template problem state
"""
# In a real implementation, this would load from a database of templates
# For now, we'll generate a simple template
# Generate code context with a sample function
code = self._generate_template_code(difficulty, bug_categories)
tests = self._generate_template_tests(code)
# Create a basic problem state
return ProblemState(
problem_id="template",
description="Fix the bugs in the given code.",
code_context={
"code": code,
"tests": tests,
"bug_count": 0,
"bug_categories": []
},
requirements=[
{
"type": "functional",
"description": "The code should pass all the provided tests.",
"difficulty": 0.3
}
],
difficulty=0.5, # Will be overridden
evolution_stage=0,
adaptation_vector=[0.0] * 5
)
def _generate_template_code(self, difficulty: str, bug_categories: List[str]) -> str:
"""
Generate template code based on difficulty and bug categories.
Args:
difficulty: The difficulty level
bug_categories: List of bug categories
Returns:
Template code
"""
# For demonstration, we'll use a few predefined templates
templates = {
"easy": """
def calculate_sum(numbers):
\"\"\"Calculate the sum of a list of numbers.\"\"\"
total = 0
for num in numbers:
total += num
return total
def calculate_average(numbers):
\"\"\"Calculate the average of a list of numbers.\"\"\"
if not numbers:
return 0
return calculate_sum(numbers) / len(numbers)
""",
"medium": """
def find_most_frequent(items):
\"\"\"Find the most frequently occurring item in
# recursive_swe_bench/task_generators/bug_fixing.py (template generation)
def find_most_frequent(items):
"""Find the most frequently occurring item in a list."""
if not items:
return None
counts = {}
for item in items:
if item in counts:
counts[item] += 1
else:
counts[item] = 1
max_count = 0
max_item = None
for item, count in counts.items():
if count > max_count:
max_count = count
max_item = item
return max_item
def binary_search(sorted_list, target):
"""Perform binary search on a sorted list."""
left = 0
right = len(sorted_list) - 1
while left <= right:
mid = (left + right) // 2
if sorted_list[mid] == target:
return mid
elif sorted_list[mid] < target:
left = mid + 1
else:
right = mid - 1
return -1 # Target not found
""",
"hard": """
def merge_sort(arr):
"""Sort an array using the merge sort algorithm."""
if len(arr) <= 1:
return arr
# Split the array into two halves
mid = len(arr) // 2
left_half = arr[:mid]
right_half = arr[mid:]
# Recursively sort both halves
left_half = merge_sort(left_half)
right_half = merge_sort(right_half)
# Merge the sorted halves
return merge(left_half, right_half)
def merge(left, right):
"""Merge two sorted arrays."""
result = []
i = j = 0
# Compare elements from both arrays and add the smaller one to the result
while i < len(left) and j < len(right):
if left[i] <= right[j]:
result.append(left[i])
i += 1
else:
result.append(right[j])
j += 1
# Add any remaining elements
result.extend(left[i:])
result.extend(right[j:])
return result
def quicksort(arr):
"""Sort an array using the quicksort algorithm."""
if len(arr) <= 1:
return arr
# Choose the pivot (using the first element for simplicity)
pivot = arr[0]
# Partition the array
less = [x for x in arr[1:] if x <= pivot]
greater = [x for x in arr[1:] if x > pivot]
# Recursively sort the partitions and combine
return quicksort(less) + [pivot] + quicksort(greater)
""",
"expert": """
class Node:
"""Node in a binary tree."""
def __init__(self, value):
self.value = value
self.left = None
self.right = None
def build_binary_tree(values):
"""Build a binary tree from a list of values."""
if not values:
return None
root = Node(values[0])
queue = [root]
i = 1
while queue and i < len(values):
node = queue.pop(0)
# Add left child
if i < len(values) and values[i] is not None:
node.left = Node(values[i])
queue.append(node.left)
i += 1
# Add right child
if i < len(values) and values[i] is not None:
node.right = Node(values[i])
queue.append(node.right)
i += 1
return root
def is_balanced(root):
"""Check if a binary tree is balanced."""
def height(node):
if not node:
return 0
return max(height(node.left), height(node.right)) + 1
def is_balanced_helper(node):
if not node:
return True
left_height = height(node.left)
right_height = height(node.right)
if abs(left_height - right_height) > 1:
return False
return is_balanced_helper(node.left) and is_balanced_helper(node.right)
return is_balanced_helper(root)
def find_lca(root, p, q):
"""Find the lowest common ancestor of two nodes in a binary tree."""
if not root:
return None
if root.value == p or root.value == q:
return root
left_lca = find_lca(root.left, p, q)
right_lca = find_lca(root.right, p, q)
if left_lca and right_lca:
return root
return left_lca if left_lca else right_lca
"""
}
# Choose a template based on difficulty
if difficulty in templates:
return templates[difficulty]
else:
return templates["medium"] # Default to medium if difficulty not found
def _generate_template_tests(self, code: str) -> List[Dict[str, Any]]:
"""
Generate template tests based on the code.
Args:
code: The template code
Returns:
List of test dictionaries
"""
# Extract function names from the code
function_names = re.findall(r'def\s+(\w+)', code)
# Generate tests for each function
tests = []
for func_name in function_names:
test_content = self._generate_test_for_function(func_name)
if test_content:
tests.append({
"name": f"test_{func_name}",
"content": test_content,
"description": f"Test for {func_name} function"
})
return tests
def _generate_test_for_function(self, func_name: str) -> str:
"""
Generate a test for a specific function.
Args:
func_name: The name of the function to test
Returns:
Test content
"""
# Check if we have a template for this function
if func_name in self.test_templates:
return self.test_templates[func_name]
# Generate a basic test based on the function name
if "sum" in func_name.lower():
return """
import unittest
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import calculate_sum
class TestCalculateSum(unittest.TestCase):
def test_calculate_sum(self):
self.assertEqual(calculate_sum([1, 2, 3, 4, 5]), 15)
self.assertEqual(calculate_sum([]), 0)
self.assertEqual(calculate_sum([-1, -2, -3]), -6)
if __name__ == '__main__':
unittest.main()
"""
elif "average" in func_name.lower():
return """
import unittest
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import calculate_average
class TestCalculateAverage(unittest.TestCase):
def test_calculate_average(self):
self.assertEqual(calculate_average([1, 2, 3, 4, 5]), 3)
self.assertEqual(calculate_average([]), 0)
self.assertEqual(calculate_average([10]), 10)
if __name__ == '__main__':
unittest.main()
"""
elif "frequent" in func_name.lower():
return """
import unittest
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import find_most_frequent
class TestFindMostFrequent(unittest.TestCase):
def test_find_most_frequent(self):
self.assertEqual(find_most_frequent([1, 2, 2, 3, 3, 3, 4]), 3)
self.assertEqual(find_most_frequent(['a', 'b', 'a', 'c', 'a']), 'a')
self.assertIsNone(find_most_frequent([]))
self.assertEqual(find_most_frequent([5]), 5)
if __name__ == '__main__':
unittest.main()
"""
elif "search" in func_name.lower():
return """
import unittest
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import binary_search
class TestBinarySearch(unittest.TestCase):
def test_binary_search(self):
self.assertEqual(binary_search([1, 2, 3, 4, 5], 3), 2)
self.assertEqual(binary_search([1, 2, 3, 4, 5], 1), 0)
self.assertEqual(binary_search([1, 2, 3, 4, 5], 5), 4)
self.assertEqual(binary_search([1, 2, 3, 4, 5], 6), -1)
self.assertEqual(binary_search([], 5), -1)
if __name__ == '__main__':
unittest.main()
"""
elif "sort" in func_name.lower():
return """
import unittest
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import {0}
class Test{1}(unittest.TestCase):
def test_sorting(self):
self.assertEqual({0}([]), [])
self.assertEqual({0}([1]), [1])
self.assertEqual({0}([3, 1, 4, 1, 5, 9, 2, 6, 5]), [1, 1, 2, 3, 4, 5, 5, 6, 9])
self.assertEqual({0}([9, 8, 7, 6, 5, 4, 3, 2, 1]), [1, 2, 3, 4, 5, 6, 7, 8, 9])
self.assertEqual({0}([1, 1, 1, 1]), [1, 1, 1, 1])
if __name__ == '__main__':
unittest.main()
""".format(func_name, func_name.title())
elif "balanced" in func_name.lower():
return """
import unittest
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import Node, is_balanced
class TestIsBalanced(unittest.TestCase):
def test_is_balanced(self):
# Create a balanced tree
# 1
# / \\
# 2 3
# / \\ / \\
# 4 5 6 7
root = Node(1)
root.left = Node(2)
root.right = Node(3)
root.left.left = Node(4)
root.left.right = Node(5)
root.right.left = Node(6)
root.right.right = Node(7)
self.assertTrue(is_balanced(root))
# Create an unbalanced tree
# 1
# / \\
# 2 3
# / \\
# 4 5
#/
#6
root = Node(1)
root.left = Node(2)
root.right = Node(3)
root.left.left = Node(4)
root.left.right = Node(5)
root.left.left.left = Node(6)
self.assertFalse(is_balanced(root))
# Empty tree is balanced
self.assertTrue(is_balanced(None))
if __name__ == '__main__':
unittest.main()
"""
elif "lca" in func_name.lower():
return """
import unittest
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import Node, find_lca
class TestFindLCA(unittest.TestCase):
def test_find_lca(self):
# Create a tree
# 1
# / \\
# 2 3
# / \\ / \\
# 4 5 6 7
root = Node(1)
root.left = Node(2)
root.right = Node(3)
root.left.left = Node(4)
root.left.right = Node(5)
root.right.left = Node(6)
root.right.right = Node(7)
# Test cases
self.assertEqual(find_lca(root, 4, 5).value, 2) # LCA of 4 and 5 is 2
self.assertEqual(find_lca(root, 4, 6).value, 1) # LCA of 4 and 6 is 1
self.assertEqual(find_lca(root, 3, 7).value, 3) # LCA of 3 and 7 is 3
self.assertEqual(find_lca(root, 2, 7).value, 1) # LCA of 2 and 7 is 1
if __name__ == '__main__':
unittest.main()
"""
elif "tree" in func_name.lower():
return """
import unittest
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import Node, build_binary_tree
class TestBuildBinaryTree(unittest.TestCase):
def test_build_binary_tree(self):
# Test empty list
self.assertIsNone(build_binary_tree([]))
# Test single node
root = build_binary_tree([1])
self.assertEqual(root.value, 1)
self.assertIsNone(root.left)
self.assertIsNone(root.right)
# Test complete tree
# 1
# / \\
# 2 3
# / \\ / \\
# 4 5 6 7
values = [1, 2, 3, 4, 5, 6, 7]
root = build_binary_tree(values)
self.assertEqual(root.value, 1)
self.assertEqual(root.left.value, 2)
self.assertEqual(root.right.value, 3)
self.assertEqual(root.left.left.value, 4)
self.assertEqual(root.left.right.value, 5)
self.assertEqual(root.right.left.value, 6)
self.assertEqual(root.right.right.value, 7)
# Test tree with None values
# 1
# / \\
# 2 3
# / /
# 4 6
values = [1, 2, 3, 4, None, 6, None]
root = build_binary_tree(values)
self.assertEqual(root.value, 1)
self.assertEqual(root.left.value, 2)
self.assertEqual(root.right.value, 3)
self.assertEqual(root.left.left.value, 4)
self.assertIsNone(root.left.right)
self.assertEqual(root.right.left.value, 6)
self.assertIsNone(root.right.right)
if __name__ == '__main__':
unittest.main()
"""
else:
# Generic test template
return """
import unittest
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import {0}
class Test{1}(unittest.TestCase):
def test_{0}(self):
# TODO: Add specific test cases for {0}
# This is a placeholder test
self.assertTrue(True)
if __name__ == '__main__':
unittest.main()
""".format(func_name, func_name.title())
def _load_test_templates(self) -> Dict[str, str]:
"""
Load test templates for common functions.
Returns:
Dictionary of test templates
"""
# In a real implementation, these would be loaded from files
return {
"calculate_sum": """
import unittest
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import calculate_sum
class TestCalculateSum(unittest.TestCase):
def test_calculate_sum(self):
self.assertEqual(calculate_sum([1, 2, 3, 4, 5]), 15)
self.assertEqual(calculate_sum([]), 0)
self.assertEqual(calculate_sum([-1, -2, -3]), -6)
if __name__ == '__main__':
unittest.main()
""",
"calculate_average": """
import unittest
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from solution import calculate_average
class TestCalculateAverage(unittest.TestCase):
def test_calculate_average(self):
self.assertEqual(calculate_average([1, 2, 3, 4, 5]), 3)
self.assertEqual(calculate_average([]), 0)
self.assertEqual(calculate_average([10]), 10)
if __name__ == '__main__':
unittest.main()
"""
}
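# A minimal sketch of the file-based loading the docstring above alludes to. The
# directory layout (one .py file per function name under template_dir) is an assumption.
#
#   def _load_test_templates_from_dir(self, template_dir: Path) -> Dict[str, str]:
#       templates = {}
#       for path in Path(template_dir).glob("*.py"):
#           templates[path.stem] = path.read_text()
#       return templates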
def _insert_bug(self, problem_state: ProblemState, bug_category: str) -> None:
"""
Insert a bug of the specified category into the problem state.
Args:
problem_state: The problem state to modify
bug_category: The category of bug to insert
"""
if "code" not in problem_state.code_context:
return
# Parse the code to find potential bug insertion points
code = problem_state.code_context["code"]
try:
parsed_code = ast.parse(code)
except SyntaxError:
# If the code already has syntax errors, don't add more bugs
return
# Insert different types of bugs based on the category
if bug_category == BugCategory.SYNTAX:
self._insert_syntax_bug(problem_state)
elif bug_category == BugCategory.LOGICAL:
self._insert_logical_bug(problem_state)
elif bug_category == BugCategory.PERFORMANCE:
self._insert_performance_bug(problem_state)
elif bug_category == BugCategory.EDGE_CASE:
self._insert_edge_case_bug(problem_state)
else:
# Default to logical bug
self._insert_logical_bug(problem_state)
# Update bug count and categories
if "bug_count" not in problem_state.code_context:
problem_state.code_context["bug_count"] = 0
problem_state.code_context["bug_count"] += 1
if "bug_categories" not in problem_state.code_context:
problem_state.code_context["bug_categories"] = []
if bug_category not in problem_state.code_context["bug_categories"]:
problem_state.code_context["bug_categories"].append(bug_category)
def _insert_syntax_bug(self, problem_state: ProblemState) -> None:
"""
Insert a syntax bug into the problem state.
Args:
problem_state: The problem state to modify
"""
code = problem_state.code_context["code"]
lines = code.split('\n')
if not lines:
return
# Choose a non-empty line to modify
idx = random.randint(0, len(lines) - 1)
line = lines[idx]
# Skip empty lines or comment lines
attempts = 0
while (not line.strip() or line.strip().startswith('#')) and attempts < 10:
idx = random.randint(0, len(lines) - 1)
line = lines[idx]
attempts += 1
if attempts >= 10:
# Couldn't find a suitable line, use the first non-empty line
for i, line in enumerate(lines):
if line.strip() and not line.strip().startswith('#'):
idx = i
break
else:
return # No suitable line found
# Choose a modification type
mod_type = random.choice([
"remove_character",
"add_character",
"swap_characters",
"change_indent"
])
if mod_type == "remove_character" and line:
char_idx = random.randint(0, len(line) - 1)
lines[idx] = line[:char_idx] + line[char_idx+1:]
elif mod_type == "add_character":
char_idx = random.randint(0, len(line))
char = random.choice(["(", ")", "{", "}", "[", "]", ":", ";", ",", "."])
lines[idx] = line[:char_idx] + char + line[char_idx:]
elif mod_type == "swap_characters" and len(line) >= 2:
char_idx = random.randint(0, len(line) - 2)
lines[idx] = (line[:char_idx] + line[char_idx+1] +
line[char_idx] + line[char_idx+2:])
elif mod_type == "change_indent":
# Either add or remove indentation
if line.startswith(" "):
lines[idx] = line[2:] # Remove some indent
else:
lines[idx] = " " + line # Add inconsistent indent
# Update the code
problem_state.code_context["code"] = '\n'.join(lines)
# Add information about the bug
if "bugs" not in problem_state.code_context:
problem_state.code_context["bugs"] = []
problem_state.code_context["bugs"].append({
"type": BugCategory.SYNTAX,
"line": idx + 1,
"description": f"Syntax error introduced in line {idx + 1}"
})
def _insert_logical_bug(self, problem_state: ProblemState) -> None:
"""
Insert a logical bug into the problem state.
Args:
problem_state: The problem state to modify
"""
code = problem_state.code_context["code"]
lines = code.split('\n')
if not lines:
return
# Find all if statements and loops
if_statements = []
for i, line in enumerate(lines):
if re.search(r'\bif\b|\bwhile\b|\bfor\b', line):
if_statements.append((i, line))
# Choose a modification type
mod_type = random.choice([
"change_comparison",
"invert_condition",
"off_by_one",
"change_operator",
"reverse_logic"
])
if if_statements:
# Choose an if statement to modify
idx, line = random.choice(if_statements)
if mod_type == "change_comparison":
# Change comparison operators
comparisons = {"==": "!=", "!=": "==", ">": "<", "<": ">", ">=": "<=", "<=": ">="}
for op, new_op in comparisons.items():
if op in line:
lines[idx] = line.replace(op, new_op, 1)
break
elif mod_type == "invert_condition":
# Add or remove a "not" to invert the condition
if "not" in line:
lines[idx] = line.replace("not ", "", 1)
else:
match = re.search(r'(if|while)\s+([^:]+):', line)
if match:
condition = match.group(2)
lines[idx] = line.replace(condition, f"not ({condition})", 1)
elif mod_type == "off_by_one":
# Introduce an off-by-one error
for op in ["+", "-"]:
if op in line:
# If there's a number after the operator, change it
match = re.search(f'\\{op}\\s*(\\d+)', line)
if match:
num = int(match.group(1))
new_num = num + 1 if op == "+" else max(0, num - 1)
lines[idx] = line.replace(f"{op} {num}", f"{op} {new_num}", 1)
break
elif mod_type == "change_operator":
# Change arithmetic or logical operators
operators = {"+": "-", "-": "+", "*": "/", "/": "*", "and": "or", "or": "and"}
for op, new_op in operators.items():
if f" {op} " in line:
lines[idx] = line.replace(f" {op} ", f" {new_op} ", 1)
break
elif mod_type == "reverse_logic":
# Reverse the logic of a compound condition
if " and " in line:
parts = line.split(" and ")
lines[idx] = line.replace(" and ".join(parts), " or ".join(parts), 1)
elif " or " in line:
parts = line.split(" or ")
lines[idx] = line.replace(" or ".join(parts), " and ".join(parts), 1)
else:
# If no if statements found, introduce a different kind of logical error
# Find variable assignments
assignments = []
idx = 0  # Fallback line index so the bug record below is valid even if no assignment is found
for i, line in enumerate(lines):
if "=" in line and "==" not in line and "!=" not in line:
assignments.append((i, line))
if assignments:
# Choose an assignment to modify
idx, line = random.choice(assignments)
# Modify the assignment
if "+" in line:
lines[idx] = line.replace("+", "-", 1)
elif "-" in line:
lines[idx] = line.replace("-", "+", 1)
elif "*" in line:
lines[idx] = line.replace("*", "/", 1)
elif "/" in line:
lines[idx] = line.replace("/", "*", 1)
else:
# If no arithmetic operator, change the value
match = re.search(r'=\s*(\d+)', line)
if match:
num = int(match.group(1))
new_num = num + random.choice([-1, 1]) * random.randint(1, 3)
lines[idx] = line.replace(f"= {num}", f"= {new_num}", 1)
# Update the code
problem_state.code_context["code"] = '\n'.join(lines)
# Add information about the bug
if "bugs" not in problem_state.code_context:
problem_state.code_context["bugs"] = []
problem_state.code_context["bugs"].append({
"type": BugCategory.LOGICAL,
"line": idx + 1,
"description": f"Logical error introduced in line {idx + 1}"
})
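# Example of the kind of single-line mutation this method produces (illustrative):
#
#   # before                               # after ("change_comparison")
#   if sorted_list[mid] == target:         if sorted_list[mid] != target: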
def _insert_performance_bug(self, problem_state: ProblemState) -> None:
"""
Insert a performance bug into the problem state.
Args:
problem_state: The problem state to modify
"""
code = problem_state.code_context["code"]
lines = code.split('\n')
if not lines:
return
# Find functions in the code
functions = []
current_func = None
func_start = None
for i, line in enumerate(lines):
if line.strip().startswith("def "):
if current_func:
functions.append((func_start, i - 1, current_func))
current_func = line.strip()[4:].split("(")[0]
func_start = i
elif i == len(lines) - 1 and current_func:
functions.append((func_start, i, current_func))
if not functions:
return
# Choose a function to modify
start_idx, end_idx, func_name = random.choice(functions)
# Choose a modification type
mod_type = random.choice([
"add_nested_loop",
"inefficient_data_structure",
"redundant_computation"
])
if mod_type == "add_nested_loop":
# Find indentation of the function
for i in range(start_idx + 1, end_idx + 1):
if lines[i].strip():
indent = len(lines[i]) - len(lines[i].lstrip())
break
else:
indent = 4
# Find a suitable place to add a nested loop
for i in range(start_idx + 1, end_idx + 1):
if "for " in lines[i] or "while " in lines[i]:
# Add a nested loop after this loop
inner_indent = len(lines[i]) - len(lines[i].lstrip()) + 4
inner_indent_str = ' ' * inner_indent
# Add an unnecessary nested loop
lines.insert(i + 1, f"{inner_indent_str}for _ in range(100): # Inefficient nested loop")
lines.insert(i + 2, f"{inner_indent_str} pass")
# Update indices
end_idx += 2
break
else:
# If no loop found, add one at the beginning of the function
inner_indent = indent + 4
inner_indent_str = ' ' * inner_indent
# Find the first non-docstring line
for i in range(start_idx + 1, end_idx + 1):
if lines[i].strip() and not (lines[i].strip().startswith('"""') or lines[i].strip().startswith("'''")):
# Add an unnecessary loop
lines.insert(i, f"{' ' * indent}for i in range(100): # Inefficient loop")
lines.insert(i + 1, f"{inner_indent_str}pass")
# Update indices
end_idx += 2
break
elif mod_type == "ineff
# recursive_swe_bench/task_generators/bug_fixing.py (finalized)
elif mod_type == "inefficient_data_structure":
# Find indentation of the function
for i in range(start_idx + 1, end_idx + 1):
if lines[i].strip():
indent = len(lines[i]) - len(lines[i].lstrip())
break
else:
indent = 4
# Find a suitable place to add inefficient data structure usage
for i in range(start_idx + 1, end_idx + 1):
if "def " not in lines[i] and lines[i].strip():
# Add inefficient data structure usage after this line
indent_str = ' ' * indent
# Add inefficient code
lines.insert(i + 1, f"{indent_str}# Inefficient data structure usage")
lines.insert(i + 2, f"{indent_str}results = []")
lines.insert(i + 3, f"{indent_str}for i in range(1000): # Unnecessarily large range")
lines.insert(i + 4, f"{indent_str} # Using list instead of set for lookups")
lines.insert(i + 5, f"{indent_str} if i % 10 in results: # O(n) lookup instead of O(1)")
lines.insert(i + 6, f"{indent_str} results.append(i) # Unnecessary storage")
# Update indices
end_idx += 6
break
elif mod_type == "redundant_computation":
# Find indentation of the function
for i in range(start_idx + 1, end_idx + 1):
if lines[i].strip():
indent = len(lines[i]) - len(lines[i].lstrip())
break
else:
indent = 4
# Find a suitable place to add redundant computation
for i in range(start_idx + 1, end_idx + 1):
if "for " in lines[i] or "while " in lines[i]:
# Add redundant computation inside the loop
inner_indent = len(lines[i]) - len(lines[i].lstrip()) + 4
inner_indent_str = ' ' * inner_indent
# Add redundant computation
lines.insert(i + 1, f"{inner_indent_str}# Redundant computation in each iteration")
lines.insert(i + 2, f"{inner_indent_str}temp_sum = 0")
lines.insert(i + 3, f"{inner_indent_str}for j in range(100): # Unnecessary nested computation")
lines.insert(i + 4, f"{inner_indent_str} temp_sum += j")
# Update indices
end_idx += 4
break
# Update the code
problem_state.code_context["code"] = '\n'.join(lines)
# Add information about the bug
if "bugs" not in problem_state.code_context:
problem_state.code_context["bugs"] = []
problem_state.code_context["bugs"].append({
"type": BugCategory.PERFORMANCE,
"line": start_idx + 1,
"description": f"Performance issue introduced in function '{func_name}'"
})
def _insert_edge_case_bug(self, problem_state: ProblemState) -> None:
"""
Insert an edge case bug into the problem state.
Args:
problem_state: The problem state to modify
"""
code = problem_state.code_context["code"]
lines = code.split('\n')
if not lines:
return
# Find functions in the code
functions = []
current_func = None
func_start = None
for i, line in enumerate(lines):
if line.strip().startswith("def "):
if current_func:
functions.append((func_start, i - 1, current_func))
current_func = line.strip()[4:].split("(")[0]
func_start = i
elif i == len(lines) - 1 and current_func:
functions.append((func_start, i, current_func))
if not functions:
return
# Choose a function to modify
start_idx, end_idx, func_name = random.choice(functions)
# Choose a modification type
mod_type = random.choice([
"remove_boundary_check",
"missing_edge_case",
"type_assumption"
])
if mod_type == "remove_boundary_check":
# Find boundary checks (if statements with conditions that check boundaries)
boundary_checks = []
for i in range(start_idx + 1, end_idx + 1):
if (re.search(r'if\s+.*(len|empty|<=|>=|<|>|==|!=)', lines[i]) and
(("if not " in lines[i]) or ("if len(" in lines[i]) or
("if " in lines[i] and " == 0" in lines[i]) or
("if " in lines[i] and " == []" in lines[i]) or
("if " in lines[i] and " == ''" in lines[i]) or
("if " in lines[i] and " is None" in lines[i]))):
boundary_checks.append(i)
if boundary_checks:
# Choose a boundary check to remove
idx = random.choice(boundary_checks)
# Comment out the boundary check
lines[idx] = f"# {lines[idx]} # Boundary check removed"
# Comment out the body of the if statement
i = idx + 1
while i <= end_idx and (not lines[i].strip() or len(lines[i]) - len(lines[i].lstrip()) > len(lines[idx]) - len(lines[idx].lstrip())):
lines[i] = f"# {lines[i]}"
i += 1
else:
# If no boundary check found, add code that assumes a non-empty input
# Find the first non-docstring line in the function
for i in range(start_idx + 1, end_idx + 1):
if lines[i].strip() and not (lines[i].strip().startswith('"""') or lines[i].strip().startswith("'''")):
indent = len(lines[i]) - len(lines[i].lstrip())
indent_str = ' ' * indent
# Add code that assumes non-empty input
lines.insert(i, f"{indent_str}# Missing check for empty input")
lines.insert(i + 1, f"{indent_str}first_item = items[0] # Will fail on empty input")
# Update indices
end_idx += 2
break
elif mod_type == "missing_edge_case":
# Find a suitable place to insert the bug
for i in range(start_idx + 1, end_idx + 1):
if ("/" in lines[i] or
"if " in lines[i] and "==" in lines[i] or
"if " in lines[i] and "!=" in lines[i]):
if "/" in lines[i] and not re.search(r'if\s+.*!=\s*0', lines[i-1]):
# Add code that doesn't check for zero division
indent = len(lines[i]) - len(lines[i].lstrip())
indent_str = ' ' * indent
# Extract the denominator
match = re.search(r'/\s*(\w+)', lines[i])
if match:
denominator = match.group(1)
# Comment out any existing check
j = i - 1
while j >= start_idx and len(lines[j]) - len(lines[j].lstrip()) >= indent:
if f"if {denominator}" in lines[j] and "== 0" in lines[j]:
lines[j] = f"# {lines[j]} # Zero division check removed"
j -= 1
# Add a comment about the missing check
lines.insert(i, f"{indent_str}# Missing check for zero division")
# Update indices
end_idx += 1
break
elif ("==" in lines[i] or "!=" in lines[i]) and "None" not in lines[i]:
# Comment out edge case check
lines[i] = f"# {lines[i]} # Edge case check removed"
break
else:
# If no suitable place found, add code that doesn't handle an edge case
# Find the first non-docstring line in the function
for i in range(start_idx + 1, end_idx + 1):
if lines[i].strip() and not (lines[i].strip().startswith('"""') or lines[i].strip().startswith("'''")):
indent = len(lines[i]) - len(lines[i].lstrip())
indent_str = ' ' * indent
# Add code that doesn't handle an edge case
lines.insert(i, f"{indent_str}# Missing handling for edge cases")
lines.insert(i + 1, f"{indent_str}# This function doesn't handle special cases properly")
# Update indices
end_idx += 2
break
elif mod_type == "type_assumption":
# Find a suitable place to insert a type assumption bug
for i in range(start_idx + 1, end_idx + 1):
if re.search(r'for\s+\w+\s+in\s+\w+', lines[i]) or "=" in lines[i] and "[" in lines[i]:
# Extract the variable name
var_match = re.search(r'for\s+\w+\s+in\s+(\w+)', lines[i])
if not var_match:
var_match = re.search(r'(\w+)\s*=', lines[i])
if var_match:
var_name = var_match.group(1)
indent = len(lines[i]) - len(lines[i].lstrip())
indent_str = ' ' * indent
# Add code that assumes a specific type
lines.insert(i + 1, f"{indent_str}# Type assumption: {var_name} is assumed to be a list")
lines.insert(i + 2, f"{indent_str}if len({var_name}) > 0: # Will fail if {var_name} doesn't support len()")
lines.insert(i + 3, f"{indent_str} first = {var_name}[0] # Will fail if {var_name} is not subscriptable")
# Update indices
end_idx += 3
break
else:
# If no suitable place found, add code at the beginning of the function
for i in range(start_idx + 1, end_idx + 1):
if lines[i].strip() and not (lines[i].strip().startswith('"""') or lines[i].strip().startswith("'''")):
indent = len(lines[i]) - len(lines[i].lstrip())
indent_str = ' ' * indent
# Extract parameter name
param_match = re.search(r'def\s+\w+\s*\(\s*(\w+)', lines[start_idx])
param_name = param_match.group(1) if param_match else "input_data"
# Add code that assumes a specific type
lines.insert(i, f"{indent_str}# Type assumption: {param_name} is assumed to be a specific type")
lines.insert(i + 1, f"{indent_str}{param_name}_str = str({param_name}) # Will fail if {param_name} can't be converted to string")
# Update indices
end_idx += 2
break
# Update the code
problem_state.code_context["code"] = '\n'.join(lines)
# Add information about the bug
if "bugs" not in problem_state.code_context:
problem_state.code_context["bugs"] = []
problem_state.code_context["bugs"].append({
"type": BugCategory.EDGE_CASE,
"line": start_idx + 1,
"description": f"Edge case bug introduced in function '{func_name}'"
})
def _generate_description(self, problem_state: ProblemState) -> str:
"""
Generate a description for the current problem state.
Args:
problem_state: The problem state
Returns:
A descriptive prompt for the problem
"""
# Base description
bug_count = problem_state.code_context.get("bug_count", 0)
plural = "bugs" if bug_count != 1 else "bug"
base_desc = (
f"Fix the {plural} in the code below. "
f"There {'are' if bug_count != 1 else 'is'} {bug_count} {plural} to find and fix."
)
# Add information about bug categories
if "bug_categories" in problem_state.code_context:
categories = problem_state.code_context["bug_categories"]
if categories:
category_desc = ", ".join(categories)
base_desc += f"\n\nThe code contains the following types of issues: {category_desc}."
# Add requirements
if problem_state.requirements:
base_desc += "\n\nRequirements:"
for i, req in enumerate(problem_state.requirements):
base_desc += f"\n{i+1}. {req['description']}"
# Add difficulty level
difficulty_desc = "easy"
if problem_state.difficulty > 0.3 and problem_state.difficulty <= 0.6:
difficulty_desc = "moderate"
elif problem_state.difficulty > 0.6 and problem_state.difficulty <= 0.8:
difficulty_desc = "challenging"
elif problem_state.difficulty > 0.8:
difficulty_desc = "very challenging"
base_desc += f"\n\nThis is a {difficulty_desc} bug fixing task."
return base_desc
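# Example output for a two-bug, medium-difficulty state, assembled as the f-strings
# above dictate:
#
#   Fix the bugs in the code below. There are 2 bugs to find and fix.
#
#   The code contains the following types of issues: logical, edge_case.
#
#   Requirements:
#   1. The code should pass all the provided tests.
#
#   This is a moderate bug fixing task.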
# Default implementation of TestRunner for when no custom runner is provided
class DefaultTestRunner:
"""
Default test runner for evaluating solutions.
This class runs tests against a solution file and collects the results.
"""
def run_tests(
self,
solution_file: Path,
test_files: List[Path],
code_context: Dict[str, Any]
) -> Dict[str, Any]:
"""
Run tests against a solution file.
Args:
solution_file: Path to the solution file
test_files: List of test file paths
code_context: Additional context about the code
Returns:
Dictionary containing test results
"""
# Initialize results dictionary
results = {
"all_passed": True,
"passed_tests": 0,
"total_tests": 0,
"tests": {},
"execution": {
"success": True,
"error": None,
"stdout": "",
"stderr": ""
},
"execution_time": 0.0
}
# Check if solution file exists
if not solution_file.exists():
results["execution"]["success"] = False
results["execution"]["error"] = f"Solution file not found: {solution_file}"
results["all_passed"] = False
return results
# Try to import the solution module
# Inline imports, following the style used elsewhere in this module
import sys
import time
import io
import importlib.util
import unittest
from contextlib import redirect_stdout, redirect_stderr
try:
start_time = time.time()
# Add solution directory to path
sys.path.insert(0, str(solution_file.parent))
# Import the solution module
spec = importlib.util.spec_from_file_location(
"solution", solution_file)
solution_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(solution_module)
# Remove the solution directory from path
sys.path.pop(0)
# Record execution time
end_time = time.time()
results["execution_time"] = end_time - start_time
except Exception as e:
results["execution"]["success"] = False
results["execution"]["error"] = str(e)
results["all_passed"] = False
return results
# Run each test file
for test_file in test_files:
# Skip if the test file doesn't exist
if not test_file.exists():
continue
try:
# Set up test loading
loader = unittest.TestLoader()
# Add test directory to path
sys.path.insert(0, str(test_file.parent))
# Capture stdout and stderr
stdout_buffer = io.StringIO()
stderr_buffer = io.StringIO()
# Create a test suite from the test file
test_suite = loader.discover(
str(test_file.parent),
pattern=test_file.name
)
# Count test cases
test_count = 0
for suite in test_suite:
for test_case in suite:
test_count += test_case.countTestCases()
results["total_tests"] += test_count
# Run the tests with captured output
with redirect_stdout(stdout_buffer), redirect_stderr(stderr_buffer):
test_runner = unittest.TextTestRunner(verbosity=2)
test_result = test_runner.run(test_suite)
# Get the captured output
stdout = stdout_buffer.getvalue()
stderr = stderr_buffer.getvalue()
# Remove the test directory from path
sys.path.pop(0)
# Check if all tests passed
if not test_result.wasSuccessful():
results["all_passed"] = False
# Count passed tests
passed_tests = test_count - len(test_result.failures) - len(test_result.errors)
results["passed_tests"] += passed_tests
# Store individual test results
test_name = test_file.stem
results["tests"][test_name] = {
"passed": test_result.wasSuccessful(),
"failures": len(test_result.failures),
"errors": len(test_result.errors),
"skipped": len(test_result.skipped),
"total": test_count,
"passed_count": passed_tests,
"stdout": stdout,
"stderr": stderr
}
# Store details for individual test failures
for failure in test_result.failures + test_result.errors:
test_id = failure[0].id().split('.')[-1]
failure_message = failure[1]
# Try to extract expected and actual values
expected_match = re.search(r'Expected\s*:(.+)', failure_message)
actual_match = re.search(r'Actual\s*:(.+)', failure_message)
expected = expected_match.group(1).strip() if expected_match else None
actual = actual_match.group(1).strip() if actual_match else None
if test_id not in results["tests"]:
results["tests"][test_id] = {}
results["tests"][test_id].update({
"passed": False,
"message": failure_message,
"expected": expected,
"actual": actual
})
except Exception as e:
# If there's an error in the test file itself
results["all_passed"] = False
test_name = test_file.stem
results["tests"][test_name] = {
"passed": False,
"error": str(e),
"failures": 0,
"errors": 1,
"skipped": 0,
"total": 1,
"passed_count": 0
}
results["total_tests"] += 1
return results
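# Standalone usage sketch for the runner (paths are hypothetical):
#
#   runner = DefaultTestRunner()
#   results = runner.run_tests(
#       solution_file=Path(temp_dir) / "solution.py",
#       test_files=[Path(temp_dir) / "tests" / "test_calculate_sum.py"],
#       code_context={},
#   )
#   print(f"{results['passed_tests']}/{results['total_tests']} tests passed")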