|
|
|
|
|
"""Bug-fixing task generator for recursive_swe_bench.

Defines BugFixingTask, a recursive task that presents buggy code, evaluates
fix attempts against generated tests, and evolves the problem (subtler bugs,
changed requirements, added complexity) after each attempt.
"""

from typing import Any, Dict, List, Optional, Tuple, Set, Union
|
import uuid |
|
import json |
|
import re |
|
import random |
|
import ast |
|
import copy |
|
from pathlib import Path |
|
import tempfile |
|
import subprocess |
|
import shutil |
|
import os |
|
|
|
from recursive_swe_bench.core.recursive_task import ( |
|
RecursiveTask, ProblemState, EvaluationResult, Feedback, TaskStatus |
|
) |
|
|
|
class BugCategory: |
|
"""Categories of bugs for classification and evolution.""" |
|
SYNTAX = "syntax" |
|
LOGICAL = "logical" |
|
PERFORMANCE = "performance" |
|
SECURITY = "security" |
|
CONCURRENCY = "concurrency" |
|
EXCEPTION_HANDLING = "exception_handling" |
|
API_USAGE = "api_usage" |
|
MEMORY_MANAGEMENT = "memory_management" |
|
TYPE_ERROR = "type_error" |
|
EDGE_CASE = "edge_case" |
|
DATA_HANDLING = "data_handling" |
|
DEPENDENCY = "dependency" |
|
|
|
|
|
class BugFixingTask(RecursiveTask): |
|
""" |
|
A recursive task for evaluating how models fix bugs in code. |
|
|
|
The task presents a piece of code with one or more bugs, and evolves |
|
based on the model's fix attempts. As the model addresses issues, |
|
the task may introduce more subtle bugs, change requirements, or |
|
increase complexity to test adaptive problem-solving. |
|
""" |
|
|
|
def __init__( |
|
self, |
|
initial_state: ProblemState, |
|
        config: Optional[Dict[str, Any]] = None,
|
test_runner: Any = None |
|
): |
|
""" |
|
Initialize the bug fixing task. |
|
|
|
Args: |
|
initial_state: The initial problem state |
|
config: Configuration options |
|
test_runner: Custom test runner (optional) |
|
""" |
|
super().__init__(initial_state, config) |
|
self.test_runner = test_runner or DefaultTestRunner() |
|
self.bug_categories: Set[str] = set( |
|
self.config.get("bug_categories", [BugCategory.LOGICAL, BugCategory.SYNTAX]) |
|
) |
|
self.difficulty_progression = self.config.get( |
|
"difficulty_progression", [0.0, 0.15, 0.3, 0.5, 0.7] |
|
) |
|
self.evolution_strategies = self.config.get( |
|
"evolution_strategies", ["add_subtle_bug", "change_requirements", "increase_complexity"] |
|
) |
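
    # Illustrative usage sketch (assumptions flagged): the ProblemState fields
    # and the evaluate() entry point below are inferred from how this class
    # uses them, not a confirmed API.
    #
    #   state = ProblemState(
    #       description="Fix the bug(s) in the following code.",
    #       code_context={"code": buggy_code, "tests": []},
    #       requirements=[],
    #       difficulty=0.0,
    #       evolution_stage=0,
    #   )
    #   task = BugFixingTask(state, config={"bug_categories": [BugCategory.LOGICAL]})
    #   result = task.evaluate(model_solution)  # assumed RecursiveTask method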
|
|
|
def _run_evaluation(self, solution: str) -> EvaluationResult: |
|
""" |
|
Run tests to evaluate the solution. |
|
|
|
Args: |
|
solution: The solution code |
|
|
|
Returns: |
|
Evaluation results |
|
""" |
|
|
|
with tempfile.TemporaryDirectory() as temp_dir: |
|
temp_path = Path(temp_dir) |
|
|
|
|
|
solution_file = temp_path / "solution.py" |
|
with open(solution_file, "w") as f: |
|
f.write(solution) |
|
|
|
|
|
test_files = self._create_test_files(temp_path) |
|
|
|
|
|
results = self.test_runner.run_tests( |
|
solution_file=solution_file, |
|
test_files=test_files, |
|
code_context=self.state.code_context |
|
) |
|
|
|
|
|
score = self._calculate_score(results) |
|
|
|
return EvaluationResult( |
|
success=results["all_passed"], |
|
score=score, |
|
execution_results=results["execution"], |
|
error_details=results.get("errors"), |
|
test_results=results["tests"], |
|
metrics={ |
|
"passed_tests": results["passed_tests"], |
|
"total_tests": results["total_tests"], |
|
"execution_time": results["execution_time"], |
|
"memory_usage": results.get("memory_usage", 0), |
|
"code_complexity": self._calculate_complexity(solution) |
|
} |
|
) |
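
    # Minimal sketch of the runner interface _run_evaluation relies on.
    # DefaultTestRunner is defined elsewhere; the result shape below is
    # inferred from the keys consumed above ("errors" and "memory_usage" are
    # optional) and is illustrative rather than the real implementation.
    #
    #   class StubTestRunner:
    #       def run_tests(self, solution_file, test_files, code_context):
    #           return {
    #               "all_passed": True,
    #               "passed_tests": 1,
    #               "total_tests": 1,
    #               "execution_time": 0.01,
    #               "execution": {"success": True},
    #               "tests": {"test_default": {"passed": True}},
    #           }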
|
|
|
def _generate_feedback(self, solution: str, result: EvaluationResult) -> Feedback: |
|
""" |
|
Generate structured feedback based on evaluation results. |
|
|
|
Args: |
|
solution: The solution code |
|
result: The evaluation results |
|
|
|
Returns: |
|
Structured feedback |
|
""" |
|
issues = [] |
|
suggestions = [] |
|
focus_areas = [] |
|
|
|
|
|
if result.test_results: |
|
for test_name, test_result in result.test_results.items(): |
|
if not test_result["passed"]: |
|
issues.append({ |
|
"type": "test_failure", |
|
"test": test_name, |
|
"message": test_result.get("message", "Test failed"), |
|
"expected": test_result.get("expected"), |
|
"actual": test_result.get("actual") |
|
}) |
|
|
|
|
|
if result.error_details: |
|
for error_type, error_info in result.error_details.items(): |
|
issues.append({ |
|
"type": "error", |
|
"error_type": error_type, |
|
"message": error_info.get("message", "An error occurred"), |
|
"location": error_info.get("location") |
|
}) |
|
|
|
|
|
for issue in issues: |
|
if issue["type"] == "test_failure": |
|
suggestion = self._generate_suggestion_for_test_failure( |
|
issue, solution, result.test_results |
|
) |
|
if suggestion: |
|
suggestions.append(suggestion) |
|
elif issue["type"] == "error": |
|
suggestion = self._generate_suggestion_for_error( |
|
issue, solution |
|
) |
|
if suggestion: |
|
suggestions.append(suggestion) |
|
|
|
|
|
focus_areas = self._determine_focus_areas(issues, solution, result) |
|
|
|
|
|
adaptation_hints = self._generate_adaptation_hints(solution, result) |
|
|
|
|
|
if result.success: |
|
summary = ( |
|
f"Your solution passes all tests with a score of {result.score:.2f}. " |
|
f"The code successfully addresses the bugs in the original implementation." |
|
) |
|
else: |
|
passed = result.metrics.get("passed_tests", 0) |
|
total = result.metrics.get("total_tests", 0) |
|
summary = ( |
|
f"Your solution passes {passed}/{total} tests with a score of {result.score:.2f}. " |
|
f"There are still issues that need to be addressed." |
|
) |
|
|
|
return Feedback( |
|
summary=summary, |
|
issues=issues, |
|
suggestions=suggestions, |
|
focus_areas=focus_areas, |
|
adaptation_hints=adaptation_hints |
|
) |
|
|
|
def _evolve_state(self, solution: str, result: EvaluationResult, feedback: Feedback) -> ProblemState: |
|
""" |
|
Evolve the problem state based on the solution and feedback. |
|
|
|
This method implements the recursive nature of the benchmark by |
|
adapting the problem to challenge the model's understanding. |
|
|
|
Args: |
|
solution: The attempted solution |
|
result: The evaluation results |
|
feedback: The feedback provided |
|
|
|
Returns: |
|
The evolved problem state |
|
""" |
|
|
|
if result.success and result.score > 0.95: |
|
return self._increase_difficulty(solution, result, feedback) |
|
|
|
|
|
elif result.score > 0.7: |
|
return self._focus_remaining_issues(solution, result, feedback) |
|
|
|
|
|
else: |
|
return self._provide_more_guidance(solution, result, feedback) |
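
    # Summary of the branching above: a passing solution scoring above 0.95
    # gets a harder problem; a score above 0.7 keeps the problem but narrows
    # its focus; anything lower reduces difficulty and adds guidance.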
|
|
|
def _increase_difficulty(self, solution: str, result: EvaluationResult, feedback: Feedback) -> ProblemState: |
|
""" |
|
Increase the difficulty of the problem for models that solved it well. |
|
|
|
Args: |
|
solution: The successful solution |
|
result: The evaluation results |
|
feedback: The feedback provided |
|
|
|
Returns: |
|
The evolved problem state with increased difficulty |
|
""" |
|
|
|
new_state = copy.deepcopy(self.state) |
|
|
|
|
|
new_state.evolution_stage += 1 |
|
|
|
|
|
current_difficulty_idx = min(new_state.evolution_stage, |
|
len(self.difficulty_progression) - 1) |
|
new_state.difficulty = self.difficulty_progression[current_difficulty_idx] |
|
|
|
|
|
strategy = self._select_evolution_strategy(solution, result, feedback) |
|
|
|
|
|
if strategy == "add_subtle_bug": |
|
self._add_subtle_bug(new_state, solution) |
|
elif strategy == "change_requirements": |
|
self._change_requirements(new_state, solution) |
|
elif strategy == "increase_complexity": |
|
self._increase_complexity(new_state, solution) |
|
|
|
|
|
new_state.description = self._generate_description(new_state) |
|
|
|
|
|
new_state.adaptation_vector = self._calculate_adaptation_vector( |
|
solution, result, feedback |
|
) |
|
|
|
return new_state |
|
|
|
def _focus_remaining_issues(self, solution: str, result: EvaluationResult, feedback: Feedback) -> ProblemState: |
|
""" |
|
Evolve the state to focus on remaining issues when the solution is close but not perfect. |
|
|
|
Args: |
|
solution: The nearly-successful solution |
|
result: The evaluation results |
|
feedback: The feedback provided |
|
|
|
Returns: |
|
The evolved problem state focusing on remaining issues |
|
""" |
|
|
|
new_state = copy.deepcopy(self.state) |
|
|
|
|
|
new_state.evolution_stage += 1 |
|
|
|
|
|
current_difficulty_idx = min(new_state.evolution_stage - 1, |
|
len(self.difficulty_progression) - 1) |
|
new_state.difficulty = self.difficulty_progression[current_difficulty_idx] |
|
|
|
|
|
new_state.code_context["focus_areas"] = feedback.focus_areas |
|
|
|
|
|
if result.test_results: |
|
failing_tests = [ |
|
test_name for test_name, test_result in result.test_results.items() |
|
if not test_result["passed"] |
|
] |
|
new_state.code_context["failing_tests"] = failing_tests |
|
|
|
|
|
new_state.description = self._generate_focused_description( |
|
new_state, feedback.issues |
|
) |
|
|
|
|
|
new_state.adaptation_vector = self._calculate_adaptation_vector( |
|
solution, result, feedback |
|
) |
|
|
|
return new_state |
|
|
|
def _provide_more_guidance(self, solution: str, result: EvaluationResult, feedback: Feedback) -> ProblemState: |
|
""" |
|
Evolve the state to provide more guidance when the solution was not very good. |
|
|
|
Args: |
|
solution: The unsuccessful solution |
|
result: The evaluation results |
|
feedback: The feedback provided |
|
|
|
Returns: |
|
The evolved problem state with more guidance |
|
""" |
|
|
|
new_state = copy.deepcopy(self.state) |
|
|
|
|
|
new_state.evolution_stage += 1 |
|
|
|
|
|
current_difficulty_idx = max(0, min(new_state.evolution_stage - 1, |
|
len(self.difficulty_progression) - 1) - 1) |
|
new_state.difficulty = self.difficulty_progression[current_difficulty_idx] |
|
|
|
|
|
new_state.code_context["hints"] = self._generate_hints( |
|
solution, result, feedback |
|
) |
|
|
|
|
|
if result.test_results: |
|
detailed_test_results = {} |
|
for test_name, test_result in result.test_results.items(): |
|
if not test_result["passed"]: |
|
detailed_test_results[test_name] = { |
|
"message": test_result.get("message", "Test failed"), |
|
"expected": test_result.get("expected"), |
|
"actual": test_result.get("actual"), |
|
"hint": self._generate_test_hint(test_name, test_result) |
|
} |
|
new_state.code_context["detailed_test_results"] = detailed_test_results |
|
|
|
|
|
new_state.description = self._generate_guided_description( |
|
new_state, feedback.issues, feedback.suggestions |
|
) |
|
|
|
|
|
new_state.adaptation_vector = self._calculate_adaptation_vector( |
|
solution, result, feedback |
|
) |
|
|
|
return new_state |
|
|
|
def _select_evolution_strategy(self, solution: str, result: EvaluationResult, feedback: Feedback) -> str: |
|
""" |
|
Select an evolution strategy based on the current state and solution. |
|
|
|
Args: |
|
solution: The current solution |
|
result: The evaluation results |
|
feedback: The feedback provided |
|
|
|
Returns: |
|
The selected evolution strategy |
|
""" |
|
available_strategies = self.evolution_strategies.copy() |
|
|
|
|
|
weights = {} |
|
|
|
|
|
if result.score > 0.95: |
|
weights["add_subtle_bug"] = 0.6 |
|
weights["change_requirements"] = 0.3 |
|
weights["increase_complexity"] = 0.1 |
|
|
|
|
|
elif self.state.evolution_stage >= 2 and "bug_count" in self.state.code_context and self.state.code_context["bug_count"] >= 3: |
|
weights["add_subtle_bug"] = 0.1 |
|
weights["change_requirements"] = 0.7 |
|
weights["increase_complexity"] = 0.2 |
|
|
|
|
|
elif result.score > 0.85: |
|
weights["add_subtle_bug"] = 0.2 |
|
weights["change_requirements"] = 0.2 |
|
weights["increase_complexity"] = 0.6 |
|
|
|
|
|
else: |
|
weights = {strategy: 1.0 / len(available_strategies) |
|
for strategy in available_strategies} |
|
|
|
|
|
        total_weight = sum(weights.get(strategy, 0) for strategy in available_strategies)
        if total_weight <= 0:
            # No configured strategy received any weight above; fall back to a
            # uniform choice rather than dividing by zero.
            return random.choice(available_strategies)
        normalized_weights = [weights.get(strategy, 0) / total_weight
                              for strategy in available_strategies]
|
|
|
|
|
return random.choices(available_strategies, weights=normalized_weights)[0] |
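
    # Worked example: with score > 0.95 and all three default strategies
    # available, the weights are {add_subtle_bug: 0.6, change_requirements: 0.3,
    # increase_complexity: 0.1}, total_weight is 1.0, and random.choices picks
    # add_subtle_bug about 60% of the time. A configured strategy with no
    # assigned weight gets 0, and the rest are renormalized over what remains.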
|
|
|
def _add_subtle_bug(self, state: ProblemState, solution: str) -> None: |
|
""" |
|
Add a subtle bug to the solution code. |
|
|
|
Args: |
|
state: The problem state to modify |
|
solution: The current solution |
|
""" |
|
|
|
try: |
|
parsed_solution = ast.parse(solution) |
|
except SyntaxError: |
|
|
|
self._add_syntax_error(state, solution) |
|
return |
|
|
|
|
|
available_categories = list(self.bug_categories) |
|
if available_categories: |
|
bug_category = random.choice(available_categories) |
|
else: |
|
bug_category = BugCategory.LOGICAL |
|
|
|
|
|
if bug_category == BugCategory.SYNTAX: |
|
self._add_syntax_error(state, solution) |
|
elif bug_category == BugCategory.LOGICAL: |
|
self._add_logical_error(state, solution, parsed_solution) |
|
elif bug_category == BugCategory.PERFORMANCE: |
|
self._add_performance_issue(state, solution, parsed_solution) |
|
elif bug_category == BugCategory.EDGE_CASE: |
|
self._add_edge_case_issue(state, solution, parsed_solution) |
|
else: |
|
|
|
self._add_logical_error(state, solution, parsed_solution) |
|
|
|
|
|
if "bug_count" not in state.code_context: |
|
state.code_context["bug_count"] = 0 |
|
state.code_context["bug_count"] += 1 |
|
|
|
|
|
if "bug_categories" not in state.code_context: |
|
state.code_context["bug_categories"] = [] |
|
state.code_context["bug_categories"].append(bug_category) |
|
|
|
def _change_requirements(self, state: ProblemState, solution: str) -> None: |
|
""" |
|
Change the requirements to challenge the current solution. |
|
|
|
Args: |
|
state: The problem state to modify |
|
solution: The current solution |
|
""" |
|
|
|
requirements = state.requirements |
|
|
|
|
|
new_requirement = self._generate_new_requirement(state, solution) |
|
if new_requirement: |
|
requirements.append(new_requirement) |
|
|
|
|
|
if requirements and random.random() < 0.5: |
|
idx = random.randint(0, len(requirements) - 1) |
|
requirements[idx] = self._modify_requirement(requirements[idx], state, solution) |
|
|
|
def _increase_complexity(self, state: ProblemState, solution: str) -> None: |
|
""" |
|
Increase the complexity of the task. |
|
|
|
Args: |
|
state: The problem state to modify |
|
solution: The current solution |
|
""" |
|
|
|
try: |
|
parsed_solution = ast.parse(solution) |
|
except SyntaxError: |
|
|
|
self._add_edge_case_requirement(state) |
|
return |
|
|
|
|
|
strategies = [ |
|
"add_edge_cases", |
|
"increase_data_volume", |
|
"add_performance_constraint", |
|
"expand_functionality" |
|
] |
|
|
|
strategy = random.choice(strategies) |
|
|
|
if strategy == "add_edge_cases": |
|
self._add_edge_case_requirement(state) |
|
elif strategy == "increase_data_volume": |
|
self._increase_data_volume(state, solution) |
|
elif strategy == "add_performance_constraint": |
|
self._add_performance_constraint(state, solution) |
|
elif strategy == "expand_functionality": |
|
self._expand_functionality(state, solution) |
|
|
|
def _create_test_files(self, temp_path: Path) -> List[Path]: |
|
""" |
|
Create test files based on the current problem state. |
|
|
|
Args: |
|
temp_path: The temporary directory path |
|
|
|
Returns: |
|
List of test file paths |
|
""" |
|
test_files = [] |
|
|
|
|
|
if "tests" in self.state.code_context: |
|
for i, test in enumerate(self.state.code_context["tests"]): |
|
test_file = temp_path / f"test_{i}.py" |
|
with open(test_file, "w") as f: |
|
f.write(test["content"]) |
|
test_files.append(test_file) |
|
|
|
|
|
if not test_files: |
|
test_file = temp_path / "test_default.py" |
|
with open(test_file, "w") as f: |
|
f.write(self._generate_default_test()) |
|
test_files.append(test_file) |
|
|
|
return test_files |
|
|
|
def _calculate_score(self, results: Dict[str, Any]) -> float: |
|
""" |
|
Calculate a score based on test results. |
|
|
|
Args: |
|
results: The test results |
|
|
|
Returns: |
|
A score between 0 and 1 |
|
""" |
|
|
|
if results["total_tests"] == 0: |
|
test_score = 0.0 |
|
else: |
|
test_score = results["passed_tests"] / results["total_tests"] |
|
|
|
|
|
execution_score = 1.0 if results["execution"]["success"] else 0.0 |
|
|
|
|
|
weights = self.config.get("score_weights", {"test": 0.7, "execution": 0.3}) |
|
score = (test_score * weights["test"] + execution_score * weights["execution"]) |
|
|
|
|
|
difficulty_modifier = 1.0 + (self.state.difficulty * 0.2) |
|
score = score / difficulty_modifier |
|
|
|
return max(0.0, min(1.0, score)) |
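
    # Worked example: 8/10 tests passing with a successful execution and the
    # default weights gives 0.8 * 0.7 + 1.0 * 0.3 = 0.86; at difficulty 0.5
    # the modifier is 1.0 + 0.5 * 0.2 = 1.1, so the final score is
    # 0.86 / 1.1 ≈ 0.78, already inside the [0, 1] clamp.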
|
|
|
def _calculate_complexity(self, code: str) -> float: |
|
""" |
|
Calculate the complexity of code. |
|
|
|
Args: |
|
code: The code to analyze |
|
|
|
Returns: |
|
A complexity score |
|
""" |
|
|
|
complexity = 1 |
|
|
|
|
|
for pattern in ["if", "for", "while", "and", "or"]: |
|
complexity += code.count(f" {pattern} ") |
|
|
|
|
|
complexity += code.count("def ") |
|
|
|
|
|
normalized = min(1.0, complexity / 50.0) |
|
|
|
return normalized |
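
    # Example: code with one "def " and three spaced keyword hits (two " if ",
    # one " for ") scores 1 + 3 + 1 = 5, which normalizes to 5 / 50 = 0.1.
    # This is a crude keyword count, not true cyclomatic complexity.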
|
|
|
def _generate_suggestion_for_test_failure( |
|
self, |
|
issue: Dict[str, Any], |
|
solution: str, |
|
test_results: Dict[str, Any] |
|
) -> Dict[str, Any]: |
|
""" |
|
Generate a suggestion for a test failure. |
|
|
|
Args: |
|
issue: The issue data |
|
solution: The solution code |
|
test_results: The test results |
|
|
|
Returns: |
|
A suggestion dictionary |
|
""" |
|
test_name = issue["test"] |
|
test_result = test_results[test_name] |
|
|
|
|
|
test_content = None |
|
for test in self.state.code_context.get("tests", []): |
|
if test.get("name") == test_name: |
|
test_content = test.get("content") |
|
break |
|
|
|
if test_content: |
|
|
|
assertion_match = re.search(r"assert.*", test_content) |
|
assertion = assertion_match.group(0) if assertion_match else None |
|
|
|
|
|
            # Names the test imports from the solution module; any that the
            # solution does not define are reported as missing.
            imports = re.findall(r"from\s+solution\s+import\s+([\w ,]+)", test_content)
            test_funcs = [name.strip() for group in imports
                          for name in group.split(",") if name.strip()]
|
solution_funcs = re.findall(r"def\s+(\w+)", solution) |
|
|
|
|
|
missing_funcs = [f for f in test_funcs if f not in solution_funcs] |
|
|
|
if missing_funcs: |
|
return { |
|
"type": "missing_function", |
|
"message": f"Implement the missing function(s): {', '.join(missing_funcs)}", |
|
"functions": missing_funcs |
|
} |
|
elif assertion: |
|
return { |
|
"type": "fix_assertion_failure", |
|
"message": f"Fix the code to pass the assertion: {assertion}", |
|
"assertion": assertion, |
|
"expected": test_result.get("expected"), |
|
"actual": test_result.get("actual") |
|
} |
|
else: |
|
return { |
|
"type": "fix_test_failure", |
|
"message": f"Fix the code to pass the test: {test_name}", |
|
"test_name": test_name |
|
} |
|
else: |
|
return { |
|
"type": "general_fix", |
|
"message": f"Fix the code to pass the failing test: {test_name}" |
|
} |
|
|
|
def _generate_suggestion_for_error( |
|
self, |
|
issue: Dict[str, Any], |
|
solution: str |
|
) -> Dict[str, Any]: |
|
""" |
|
Generate a suggestion for an error. |
|
|
|
Args: |
|
issue: The issue data |
|
solution: The solution code |
|
|
|
Returns: |
|
A suggestion dictionary |
|
""" |
|
error_type = issue["error_type"] |
|
message = issue["message"] |
|
location = issue.get("location") |
|
|
|
if error_type == "syntax": |
|
return { |
|
"type": "fix_syntax", |
|
"message": f"Fix the syntax error: {message}", |
|
"location": location |
|
} |
|
elif error_type == "runtime": |
|
return { |
|
"type": "fix_runtime_error", |
|
"message": f"Fix the runtime error: {message}", |
|
"location": location |
|
} |
|
else: |
|
return { |
|
"type": "fix_error", |
|
"message": f"Fix the error: {message}", |
|
"error_type": error_type, |
|
"location": location |
|
} |
|
|
|
def _determine_focus_areas( |
|
self, |
|
issues: List[Dict[str, Any]], |
|
solution: str, |
|
result: EvaluationResult |
|
) -> List[str]: |
|
""" |
|
Determine focus areas based on issues and results. |
|
|
|
Args: |
|
issues: The identified issues |
|
solution: The solution code |
|
result: The evaluation results |
|
|
|
Returns: |
|
List of focus areas |
|
""" |
|
focus_areas = [] |
|
|
|
|
|
syntax_issues = [i for i in issues if i.get("error_type") == "syntax"] |
|
if syntax_issues: |
|
focus_areas.append("syntax") |
|
|
|
|
|
test_issues = [i for i in issues if i["type"] == "test_failure"] |
|
if test_issues: |
|
if any("expected" in i and "actual" in i for i in test_issues): |
|
focus_areas.append("logic") |
|
else: |
|
focus_areas.append("functionality") |
|
|
|
|
|
if result.metrics and "execution_time" in result.metrics: |
|
if result.metrics["execution_time"] > self.config.get("performance_threshold", 1.0): |
|
focus_areas.append("performance") |
|
|
|
|
|
if result.metrics and "code_complexity" in result.metrics: |
|
if result.metrics["code_complexity"] > self.config.get("complexity_threshold", 0.7): |
|
focus_areas.append("complexity") |
|
|
|
|
|
if not focus_areas: |
|
focus_areas.append("general") |
|
|
|
return focus_areas |
|
|
|
def _generate_adaptation_hints( |
|
self, |
|
solution: str, |
|
result: EvaluationResult |
|
) -> List[Dict[str, Any]]: |
|
""" |
|
Generate hints about how the problem might adapt in the next iteration. |
|
|
|
Args: |
|
solution: The solution code |
|
result: The evaluation results |
|
|
|
Returns: |
|
List of adaptation hints |
|
""" |
|
hints = [] |
|
|
|
|
|
if result.score > 0.8: |
|
hints.append({ |
|
"type": "complexity_increase", |
|
"message": "The problem may become more complex in the next iteration." |
|
}) |
|
|
|
|
|
if result.score > 0.9 and self.state.evolution_stage >= 1: |
|
hints.append({ |
|
"type": "requirement_change", |
|
"message": "The requirements may change in the next iteration." |
|
}) |
|
|
|
|
|
if result.score > 0.95: |
|
hints.append({ |
|
"type": "new_bugs", |
|
"message": "New, more subtle bugs may be introduced in the next iteration." |
|
}) |
|
|
|
|
|
        if 0.7 < result.score < 0.95:
|
focus_areas = result.metrics.get("focus_areas", []) |
|
if focus_areas: |
|
hints.append({ |
|
"type": "focus_shift", |
|
"message": f"The next iteration may focus more on: {', '.join(focus_areas)}", |
|
"areas": focus_areas |
|
}) |
|
|
|
return hints |
|
|
|
def _generate_description(self, state: ProblemState) -> str: |
|
""" |
|
Generate a description for the current problem state. |
|
|
|
Args: |
|
state: The problem state |
|
|
|
Returns: |
|
A descriptive prompt for the problem |
|
""" |
|
|
|
base_desc = ( |
|
f"Fix the bug(s) in the following code. " |
|
f"This is iteration {state.evolution_stage + 1} of the task." |
|
) |
|
|
|
|
|
if "bug_categories" in state.code_context: |
|
categories = state.code_context["bug_categories"] |
|
if categories: |
|
base_desc += f"\n\nThe code contains the following types of issues: {', '.join(categories)}." |
|
|
|
|
|
if state.requirements: |
|
base_desc += "\n\nRequirements:" |
|
for i, req in enumerate(state.requirements): |
|
base_desc += f"\n{i+1}. {req['description']}" |
|
|
|
|
|
difficulty_desc = "easy" |
|
if state.difficulty > 0.3 and state.difficulty <= 0.6: |
|
difficulty_desc = "moderate" |
|
elif state.difficulty > 0.6 and state.difficulty <= 0.8: |
|
difficulty_desc = "challenging" |
|
elif state.difficulty > 0.8: |
|
difficulty_desc = "very challenging" |
|
|
|
base_desc += f"\n\nThis is a {difficulty_desc} bug fixing task." |
|
|
|
return base_desc |
|
|
|
def _generate_focused_description(self, state: ProblemState, issues: List[Dict[str, Any]]) -> str: |
|
""" |
|
Generate a description focused on remaining issues. |
|
|
|
Args: |
|
state: The problem state |
|
issues: The identified issues |
|
|
|
Returns: |
|
A descriptive prompt focused on remaining issues |
|
""" |
|
base_desc = self._generate_description(state) |
|
|
|
|
|
if issues: |
|
base_desc += "\n\nFocus on the following issues:" |
|
for i, issue in enumerate(issues): |
|
if issue["type"] == "test_failure": |
|
base_desc += f"\n{i+1}. Test failure in '{issue['test']}': {issue['message']}" |
|
else: |
|
base_desc += f"\n{i+1}. {issue['error_type']} error: {issue['message']}" |
|
|
|
|
|
if "focus_areas" in state.code_context: |
|
areas = state.code_context["focus_areas"] |
|
if areas: |
|
base_desc += f"\n\nPay particular attention to: {', '.join(areas)}." |
|
|
|
return base_desc |
|
|
|
def _generate_guided_description( |
|
self, |
|
state: ProblemState, |
|
issues: List[Dict[str, Any]], |
|
suggestions: List[Dict[str, Any]] |
|
) -> str: |
|
""" |
|
Generate a description with added guidance. |
|
|
|
Args: |
|
state: The problem state |
|
issues: The identified issues |
|
suggestions: The suggested fixes |
|
|
|
Returns: |
|
A descriptive prompt with added guidance |
|
""" |
|
base_desc = self._generate_description(state) |
|
|
|
|
|
if issues: |
|
base_desc += "\n\nThe following issues were identified in your previous solution:" |
|
for i, issue in enumerate(issues): |
|
if issue["type"] == "test_failure": |
|
base_desc += f"\n{i+1}. Test failure in '{issue['test']}': {issue['message']}" |
|
if "expected" in issue and "actual" in issue: |
|
base_desc += f"\n Expected: {issue['expected']}" |
|
base_desc += f"\n Actual: {issue['actual']}" |
|
else: |
|
base_desc += f"\n{i+1}. {issue['error_type']} error: {issue['message']}" |
|
if "location" in issue: |
|
base_desc += f"\n Location: {issue['location']}" |
|
|
|
|
|
if suggestions: |
|
base_desc += "\n\nConsider the following suggestions:" |
|
for i, suggestion in enumerate(suggestions): |
|
base_desc += f"\n{i+1}. {suggestion['message']}" |
|
|
|
|
|
if "hints" in state.code_context: |
|
hints = state.code_context["hints"] |
|
if hints: |
|
base_desc += "\n\nHints:" |
|
for i, hint in enumerate(hints): |
|
base_desc += f"\n{i+1}. {hint}" |
|
|
|
return base_desc |
|
|
|
def _generate_hints( |
|
self, |
|
solution: str, |
|
result: EvaluationResult, |
|
feedback: Feedback |
|
) -> List[str]: |
|
""" |
|
Generate hints based on the solution and feedback. |
|
|
|
Args: |
|
solution: The solution code |
|
result: The evaluation results |
|
feedback: The feedback provided |
|
|
|
Returns: |
|
List of hints |
|
""" |
|
hints = [] |
|
|
|
|
|
if result.test_results: |
|
failing_tests = [ |
|
test_name for test_name, test_result in result.test_results.items() |
|
if not test_result["passed"] |
|
] |
|
|
|
if failing_tests: |
|
test_hint = "Focus on fixing the failing tests" |
|
|
|
|
|
for test_name in failing_tests[:2]: |
|
test_result = result.test_results[test_name] |
|
if "expected" in test_result and "actual" in test_result: |
|
test_hint += f". For test '{test_name}', expected '{test_result['expected']}' but got '{test_result['actual']}'" |
|
|
|
hints.append(test_hint + ".") |
|
|
|
|
|
if result.error_details: |
|
for error_type, error_info in result.error_details.items(): |
|
hints.append(f"Fix the {error_type} error: {error_info.get('message', 'Unknown error')}.") |
|
|
|
|
|
for area in feedback.focus_areas: |
|
if area == "syntax": |
|
hints.append("Check your syntax carefully, especially parentheses, indentation, and function definitions.") |
|
elif area == "logic": |
|
hints.append("Review the logic of your solution, especially conditional statements and loop conditions.") |
|
elif area == "functionality": |
|
hints.append("Ensure your solution implements all required functionality specified in the tests.") |
|
elif area == "performance": |
|
hints.append("Consider optimizing your solution for better performance, avoid unnecessary operations.") |
|
elif area == "complexity": |
|
hints.append("Try to simplify your solution, it may be more complex than necessary.") |
|
|
|
return hints |
|
|
|
def _generate_test_hint(self, test_name: str, test_result: Dict[str, Any]) -> str: |
|
""" |
|
Generate a hint for a specific failing test. |
|
|
|
Args: |
|
test_name: The name of the test |
|
test_result: The test result |
|
|
|
Returns: |
|
A hint for the test |
|
""" |
|
if "expected" in test_result and "actual" in test_result: |
|
return f"The test expected '{test_result['expected']}' but got '{test_result['actual']}'" |
|
elif "message" in test_result: |
|
return test_result["message"] |
|
else: |
|
return "The test failed, but no detailed information is available." |
|
|
|
def _add_syntax_error(self, state: ProblemState, solution: str) -> None: |
|
""" |
|
Add a syntax error to the solution code. |
|
|
|
Args: |
|
state: The problem state to modify |
|
solution: The current solution |
|
""" |
|
lines = solution.split('\n') |
|
if not lines: |
|
return |
|
|
|
|
|
        # Pick a random non-empty, non-comment line; bail out if none exist
        # (avoids an infinite loop on blank or comment-only code).
        candidates = [i for i, l in enumerate(lines)
                      if l.strip() and not l.strip().startswith('#')]
        if not candidates:
            return
        idx = random.choice(candidates)
        line = lines[idx]
|
|
|
|
|
mod_type = random.choice([ |
|
"remove_character", |
|
"add_character", |
|
"swap_characters", |
|
"change_indent" |
|
]) |
|
|
|
if mod_type == "remove_character" and line: |
|
char_idx = random.randint(0, len(line) - 1) |
|
lines[idx] = line[:char_idx] + line[char_idx+1:] |
|
|
|
elif mod_type == "add_character": |
|
char_idx = random.randint(0, len(line)) |
|
char = random.choice(["(", ")", "{", "}", "[", "]", ":", ";", ",", "."]) |
|
lines[idx] = line[:char_idx] + char + line[char_idx:] |
|
|
|
elif mod_type == "swap_characters" and len(line) >= 2: |
|
char_idx = random.randint(0, len(line) - 2) |
|
lines[idx] = (line[:char_idx] + line[char_idx+1] + |
|
line[char_idx] + line[char_idx+2:]) |
|
|
|
elif mod_type == "change_indent": |
|
|
|
if line.startswith(" "): |
|
lines[idx] = line[2:] |
|
else: |
|
lines[idx] = " " + line |
|
|
|
|
|
modified_code = '\n'.join(lines) |
|
state.code_context["code"] = modified_code |
|
|
|
|
|
if "bugs" not in state.code_context: |
|
state.code_context["bugs"] = [] |
|
|
|
state.code_context["bugs"].append({ |
|
"type": "syntax", |
|
"line": idx + 1, |
|
"description": f"Syntax error introduced in line {idx + 1}" |
|
}) |
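
    # Example mutation: "swap_characters" can turn "result = compute(x)" into
    # "result = comptue(x)", which still parses but fails at runtime, while
    # "remove_character" may drop a parenthesis and produce a genuine
    # SyntaxError; not every mutation is guaranteed to break parsing.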
|
|
|
def _add_logical_error(self, state: ProblemState, solution: str, parsed_solution: ast.Module) -> None: |
|
""" |
|
Add a logical error to the solution code. |
|
|
|
Args: |
|
state: The problem state to modify |
|
solution: The current solution |
|
parsed_solution: The parsed AST of the solution |
|
""" |
|
modification_types = [ |
|
"change_comparison", |
|
"invert_condition", |
|
"off_by_one", |
|
"change_operator", |
|
"reverse_logic" |
|
] |
|
|
|
        mod_type = random.choice(modification_types)
        lines = solution.split('\n')
        idx = 0  # Default bug location; updated below when a candidate line is found.
|
|
|
|
|
if_statements = [] |
|
for i, line in enumerate(lines): |
|
if re.search(r'\bif\b|\bwhile\b|\bfor\b', line): |
|
if_statements.append((i, line)) |
|
|
|
|
if if_statements: |
|
|
|
idx, line = random.choice(if_statements) |
|
|
|
if mod_type == "change_comparison": |
|
|
|
comparisons = {"==": "!=", "!=": "==", ">": "<", "<": ">", ">=": "<=", "<=": ">="} |
|
for op, new_op in comparisons.items(): |
|
if op in line: |
|
lines[idx] = line.replace(op, new_op, 1) |
|
break |
|
|
|
elif mod_type == "invert_condition": |
|
|
|
if "not" in line: |
|
lines[idx] = line.replace("not ", "", 1) |
|
else: |
|
match = re.search(r'(if|while)\s+([^:]+):', line) |
|
if match: |
|
condition = match.group(2) |
|
lines[idx] = line.replace(condition, f"not ({condition})", 1) |
|
|
|
elif mod_type == "off_by_one": |
|
|
|
for op in ["+", "-"]: |
|
if op in line: |
|
|
|
match = re.search(f'\\{op}\\s*(\\d+)', line) |
|
if match: |
|
num = int(match.group(1)) |
|
new_num = num + 1 if op == "+" else max(0, num - 1) |
|
lines[idx] = line.replace(f"{op} {num}", f"{op} {new_num}", 1) |
|
break |
|
|
|
elif mod_type == "change_operator": |
|
|
|
operators = {"+": "-", "-": "+", "*": "/", "/": "*", "and": "or", "or": "and"} |
|
for op, new_op in operators.items(): |
|
if f" {op} " in line: |
|
lines[idx] = line.replace(f" {op} ", f" {new_op} ", 1) |
|
break |
|
|
|
elif mod_type == "reverse_logic": |
|
|
|
if " and " in line: |
|
parts = line.split(" and ") |
|
lines[idx] = line.replace(" and ".join(parts), " or ".join(parts), 1) |
|
elif " or " in line: |
|
parts = line.split(" or ") |
|
lines[idx] = line.replace(" or ".join(parts), " and ".join(parts), 1) |
|
|
|
else: |
|
|
|
|
|
assignments = [] |
|
for i, line in enumerate(lines): |
|
if "=" in line and "==" not in line and "!=" not in line: |
|
assignments.append((i, line)) |
|
|
|
if assignments: |
|
|
|
idx, line = random.choice(assignments) |
|
|
|
|
|
if "+" in line: |
|
lines[idx] = line.replace("+", "-", 1) |
|
elif "-" in line: |
|
lines[idx] = line.replace("-", "+", 1) |
|
elif "*" in line: |
|
lines[idx] = line.replace("*", "/", 1) |
|
elif "/" in line: |
|
lines[idx] = line.replace("/", "*", 1) |
|
else: |
|
|
|
match = re.search(r'=\s*(\d+)', line) |
|
if match: |
|
num = int(match.group(1)) |
|
new_num = num + random.choice([-1, 1]) * random.randint(1, 3) |
|
lines[idx] = line.replace(f"= {num}", f"= {new_num}", 1) |
|
|
|
|
|
modified_code = '\n'.join(lines) |
|
state.code_context["code"] = modified_code |
|
|
|
|
|
if "bugs" not in state.code_context: |
|
state.code_context["bugs"] = [] |
|
|
|
state.code_context["bugs"].append({ |
|
"type": "logical", |
|
"line": idx + 1, |
|
"description": f"Logical error introduced in line {idx + 1}: {mod_type}" |
|
}) |
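
    # Example mutations, as implemented above:
    #   change_comparison:  "if x == target:"   ->  "if x != target:"
    #   off_by_one:         "while i < n + 1:"  ->  "while i < n + 2:"
    #   reverse_logic:      "if a and b:"       ->  "if a or b:"
    # Each edit keeps the line parseable but changes behavior, which is what
    # makes these bugs logical rather than syntactic.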
|
|
|
def _add_performance_issue(self, state: ProblemState, solution: str, parsed_solution: ast.Module) -> None: |
|
""" |
|
Add a performance issue to the solution code. |
|
|
|
Args: |
|
state: The problem state to modify |
|
solution: The current solution |
|
parsed_solution: The parsed AST of the solution |
|
""" |
|
        lines = solution.split('\n')
        idx = 0  # Default bug location; updated below when a loop or function is found.
|
|
|
|
|
loops = [] |
|
for i, line in enumerate(lines): |
|
if re.search(r'\bfor\b|\bwhile\b', line): |
|
loops.append((i, line)) |
|
|
|
if loops: |
|
|
|
idx, line = random.choice(loops) |
|
|
|
|
|
mod_type = random.choice([ |
|
"add_nested_loop", |
|
"replace_efficient_operation", |
|
"add_redundant_computation" |
|
]) |
|
|
|
if mod_type == "add_nested_loop": |
|
|
|
indent = len(line) - len(line.lstrip()) |
|
indent_str = ' ' * indent |
|
loop_body_indent = indent_str + ' ' |
|
|
|
|
|
end_idx = idx + 1 |
|
while end_idx < len(lines) and (not lines[end_idx].strip() or len(lines[end_idx]) - len(lines[end_idx].lstrip()) > indent): |
|
end_idx += 1 |
|
|
|
|
|
insert_pos = end_idx |
|
lines.insert(insert_pos, f"{loop_body_indent}for _ in range(100): # Unnecessary loop") |
|
lines.insert(insert_pos + 1, f"{loop_body_indent} pass") |
|
|
|
elif mod_type == "replace_efficient_operation": |
|
|
|
|
|
for i in range(idx + 1, min(idx + 10, len(lines))): |
|
if "append" in lines[i] or "extend" in lines[i]: |
|
indent = len(lines[i]) - len(lines[i].lstrip()) |
|
indent_str = ' ' * indent |
|
match = re.search(r'(\w+)\.(append|extend)', lines[i]) |
|
if match: |
|
list_name = match.group(1) |
|
operation = match.group(2) |
|
item = lines[i].split(f"{list_name}.{operation}(")[1].split(")")[0] |
|
|
|
if operation == "append": |
|
|
|
lines[i] = f"{indent_str}{list_name} = {list_name} + [{item}] # Less efficient than append" |
|
elif operation == "extend": |
|
|
|
lines[i] = f"{indent_str}{list_name} = {list_name} + {item} # Less efficient than extend" |
|
break |
|
|
|
elif mod_type == "add_redundant_computation": |
|
|
|
|
|
if idx + 1 < len(lines): |
|
body_indent = len(lines[idx + 1]) - len(lines[idx + 1].lstrip()) |
|
body_indent_str = ' ' * body_indent |
|
|
|
|
|
lines.insert(idx + 1, f"{body_indent_str}temp = [] # Redundant computation") |
|
lines.insert(idx + 2, f"{body_indent_str}for i in range(1000):") |
|
lines.insert(idx + 3, f"{body_indent_str} temp.append(i)") |
|
lines.insert(idx + 4, f"{body_indent_str} temp.sort() # Unnecessary sort in each iteration") |
|
|
|
else: |
|
|
|
function_defs = [] |
|
for i, line in enumerate(lines): |
|
if line.strip().startswith("def "): |
|
function_defs.append((i, line)) |
|
|
|
if function_defs: |
|
|
|
idx, line = random.choice(function_defs) |
|
|
|
|
|
if idx + 1 < len(lines): |
|
body_indent = len(lines[idx + 1]) - len(lines[idx + 1].lstrip()) |
|
body_indent_str = ' ' * body_indent |
|
|
|
|
|
lines.insert(idx + 1, f"{body_indent_str}# Inefficient data structure usage") |
|
lines.insert(idx + 2, f"{body_indent_str}data = []") |
|
lines.insert(idx + 3, f"{body_indent_str}for i in range(1000):") |
|
lines.insert(idx + 4, f"{body_indent_str} data.append(i)") |
|
lines.insert(idx + 5, f"{body_indent_str} # Inefficient search operation") |
|
lines.insert(idx + 6, f"{body_indent_str} if i in data: # Linear search instead of using a set") |
|
lines.insert(idx + 7, f"{body_indent_str} pass") |
|
|
|
|
|
modified_code = '\n'.join(lines) |
|
state.code_context["code"] = modified_code |
|
|
|
|
|
if "bugs" not in state.code_context: |
|
state.code_context["bugs"] = [] |
|
|
|
state.code_context["bugs"].append({ |
|
"type": "performance", |
|
"line": idx + 1, |
|
"description": f"Performance issue introduced around line {idx + 1}" |
|
}) |
|
|
|
def _add_edge_case_issue(self, state: ProblemState, solution: str, parsed_solution: ast.Module) -> None: |
|
""" |
|
Add an edge case issue to the solution code. |
|
|
|
Args: |
|
state: The problem state to modify |
|
solution: The current solution |
|
parsed_solution: The parsed AST of the solution |
|
""" |
|
lines = solution.split('\n') |
|
|
|
|
|
        # Find function definitions and their line ranges
        functions = []
        current_func = None
        func_start = None
        for i, line in enumerate(lines):
            if line.strip().startswith("def "):
                if current_func is not None:
                    functions.append((func_start, i - 1, current_func))
                current_func = line.strip()[4:].split("(")[0]
                func_start = i
        if current_func is not None:
            functions.append((func_start, len(lines) - 1, current_func))
|
|
|
        # Defaults so the bug record below is well-formed even when no
        # function definition is found in the solution.
        start_idx, func_name, mod_type = 0, "<unknown>", "none"
        if functions:
|
|
|
start_idx, end_idx, func_name = random.choice(functions) |
|
|
|
|
|
mod_type = random.choice([ |
|
"remove_boundary_check", |
|
"introduce_zero_division", |
|
"handling_empty_input", |
|
"type_assumption" |
|
]) |
|
|
|
if mod_type == "remove_boundary_check": |
|
|
|
for i in range(start_idx, end_idx + 1): |
|
if re.search(r'if\s+.*(?:len|count|size|length|empty|<=|>=|<|>|\!=)', lines[i]): |
|
|
|
lines[i] = f"# {lines[i]} # Boundary check removed" |
|
|
|
j = i + 1 |
|
indent = len(lines[i]) - len(lines[i].lstrip()) |
|
body_indent = indent + 4 |
|
while j <= end_idx and (not lines[j].strip() or len(lines[j]) - len(lines[j].lstrip()) >= body_indent): |
|
lines[j] = f"# {lines[j]}" |
|
j += 1 |
|
break |
|
|
|
elif mod_type == "introduce_zero_division": |
|
|
|
for i in range(start_idx, end_idx + 1): |
|
if "/" in lines[i] and "try" not in lines[i] and "except" not in lines[i]: |
|
|
|
if re.search(r'if\s+.*(?:!=\s*0|>\s*0)', lines[i]): |
|
lines[i] = f"# {lines[i]} # Denominator check removed" |
|
else: |
|
|
|
match = re.search(r'(\w+)\s*/\s*(\w+)', lines[i]) |
|
if match: |
|
denominator = match.group(2) |
|
|
|
indent = len(lines[i]) - len(lines[i].lstrip()) |
|
indent_str = ' ' * indent |
|
lines.insert(i, f"{indent_str}if random.random() < 0.1: # Introduce potential zero division") |
|
lines.insert(i + 1, f"{indent_str} {denominator} = 0") |
|
break |
|
|
|
elif mod_type == "handling_empty_input": |
|
|
|
params = re.search(r'def\s+\w+\s*\((.*?)\)', lines[start_idx]) |
|
if params and params.group(1): |
|
param_list = [p.strip() for p in params.group(1).split(",")] |
|
if param_list: |
|
param = param_list[0].split("=")[0].strip() |
|
|
|
for i in range(start_idx + 1, end_idx + 1): |
|
if re.search(rf'if\s+.*(?:not\s+{param}|len\s*\(\s*{param}\s*\)\s*==\s*0)', lines[i]): |
|
|
|
lines[i] = f"# {lines[i]} # Empty input check removed" |
|
|
|
j = i + 1 |
|
indent = len(lines[i]) - len(lines[i].lstrip()) |
|
body_indent = indent + 4 |
|
while j <= end_idx and (not lines[j].strip() or len(lines[j]) - len(lines[j].lstrip()) >= body_indent): |
|
lines[j] = f"# {lines[j]}" |
|
j += 1 |
|
break |
|
|
|
elif mod_type == "type_assumption": |
|
|
|
params = re.search(r'def\s+\w+\s*\((.*?)\)', lines[start_idx]) |
|
if params and params.group(1): |
|
param_list = [p.strip() for p in params.group(1).split(",")] |
|
if param_list: |
|
param = param_list[0].split("=")[0].strip() |
|
|
|
type_check_found = False |
|
for i in range(start_idx + 1, end_idx + 1): |
|
if re.search(rf'(?:isinstance|type)\s*\(\s*{param}\s*,', lines[i]): |
|
|
|
lines[i] = f"# {lines[i]} # Type check removed" |
|
type_check_found = True |
|
break |
|
|
|
if not type_check_found: |
|
|
|
indent = 4 |
|
for i in range(start_idx + 1, min(start_idx + 5, end_idx + 1)): |
|
if lines[i].strip(): |
|
indent = len(lines[i]) - len(lines[i].lstrip()) |
|
break |
|
|
|
indent_str = ' ' * indent |
|
|
|
lines.insert(start_idx + 1, f"{indent_str}# Assuming {param} is a specific type without checking") |
|
lines.insert(start_idx + 2, f"{indent_str}{param}_length = len({param}) # Will fail if {param} doesn't support len()") |
|
|
|
|
|
modified_code = '\n'.join(lines) |
|
state.code_context["code"] = modified_code |
|
|
|
|
|
if "bugs" not in state.code_context: |
|
state.code_context["bugs"] = [] |
|
|
|
state.code_context["bugs"].append({ |
|
"type": "edge_case", |
|
"line": start_idx + 1, |
|
"description": f"Edge case issue introduced in function '{func_name}': {mod_type}" |
|
}) |
|
|
|
def _generate_new_requirement(self, state: ProblemState, solution: str) -> Dict[str, Any]: |
|
""" |
|
Generate a new requirement based on the current state and solution. |
|
|
|
Args: |
|
state: The current problem state |
|
solution: The current solution |
|
|
|
Returns: |
|
A new requirement dictionary |
|
""" |
|
|
|
function_names = re.findall(r'def\s+(\w+)', solution) |
|
variable_names = re.findall(r'(\w+)\s*=', solution) |
|
|
|
|
|
req_type = random.choice([ |
|
"edge_case_handling", |
|
"performance_improvement", |
|
"error_handling", |
|
"type_checking", |
|
"feature_addition" |
|
]) |
|
|
|
if req_type == "edge_case_handling": |
|
if function_names: |
|
func_name = random.choice(function_names) |
|
edge_cases = [ |
|
"empty input", |
|
"negative values", |
|
"zero values", |
|
"extremely large values", |
|
"special characters", |
|
"duplicate values" |
|
] |
|
edge_case = random.choice(edge_cases) |
|
return { |
|
"type": "edge_case_handling", |
|
"description": f"The function '{func_name}' should handle {edge_case} correctly.", |
|
"difficulty": random.uniform(0.3, 0.7) |
|
} |
|
|
|
elif req_type == "performance_improvement": |
|
return { |
|
"type": "performance_improvement", |
|
"description": "The solution should be optimized to run in O(n) time or better.", |
|
"difficulty": random.uniform(0.4, 0.8) |
|
} |
|
|
|
elif req_type == "error_handling": |
|
error_types = [ |
|
"invalid input", |
|
"division by zero", |
|
"file not found", |
|
"network timeout", |
|
"permission denied" |
|
] |
|
error_type = random.choice(error_types) |
|
return { |
|
"type": "error_handling", |
|
"description": f"The code should handle {error_type} errors gracefully.", |
|
"difficulty": random.uniform(0.2, 0.6) |
|
} |
|
|
|
elif req_type == "type_checking": |
|
if function_names: |
|
func_name = random.choice(function_names) |
|
return { |
|
"type": "type_checking", |
|
"description": f"The function '{func_name}' should validate input types before processing.", |
|
"difficulty": random.uniform(0.1, 0.5) |
|
} |
|
|
|
elif req_type == "feature_addition": |
|
features = [ |
|
"logging capability", |
|
"progress tracking", |
|
"caching for repeated operations", |
|
"parameter validation", |
|
"configuration options" |
|
] |
|
feature = random.choice(features) |
|
return { |
|
"type": "feature_addition", |
|
"description": f"Add {feature} to the solution.", |
|
"difficulty": random.uniform(0.3, 0.7) |
|
} |
|
|
|
|
|
return { |
|
"type": "general_improvement", |
|
"description": "Improve the overall code quality and readability.", |
|
"difficulty": random.uniform(0.1, 0.4) |
|
} |
|
|
|
def _modify_requirement(self, requirement: Dict[str, Any], state: ProblemState, solution: str) -> Dict[str, Any]: |
|
""" |
|
Modify an existing requirement to make it more challenging. |
|
|
|
Args: |
|
requirement: The requirement to modify |
|
state: The current problem state |
|
solution: The current solution |
|
|
|
Returns: |
|
The modified requirement |
|
""" |
|
|
|
modified_req = copy.deepcopy(requirement) |
|
|
|
|
|
modified_req["difficulty"] = min(1.0, requirement.get("difficulty", 0.3) + random.uniform(0.1, 0.3)) |
|
|
|
|
|
if requirement["type"] == "edge_case_handling": |
|
modified_req["description"] += " Additionally, it should handle very large inputs efficiently." |
|
|
|
elif requirement["type"] == "performance_improvement": |
|
modified_req["description"] = modified_req["description"].replace("O(n)", "O(log n)") |
|
|
|
elif requirement["type"] == "error_handling": |
|
modified_req["description"] += " And provide detailed error messages for debugging." |
|
|
|
elif requirement["type"] == "type_checking": |
|
modified_req["description"] += " And automatically convert types when possible." |
|
|
|
elif requirement["type"] == "feature_addition": |
|
modified_req["description"] += " Ensure this feature is configurable via parameters." |
|
|
|
else: |
|
modified_req["description"] += " The code should also be well-documented with comments." |
|
|
|
return modified_req |
|
|
|
def _add_edge_case_requirement(self, state: ProblemState) -> None: |
|
""" |
|
Add a requirement for handling edge cases. |
|
|
|
Args: |
|
state: The problem state to modify |
|
""" |
|
edge_cases = [ |
|
"empty collections", |
|
"null/None values", |
|
"boundary values (min/max)", |
|
"negative numbers", |
|
"special characters", |
|
"Unicode characters", |
|
"very large inputs", |
|
"malformed input" |
|
] |
|
|
|
edge_case = random.choice(edge_cases) |
|
|
|
|
|
state.requirements.append({ |
|
"type": "edge_case_handling", |
|
"description": f"The solution must correctly handle {edge_case}.", |
|
"difficulty": random.uniform(0.3, 0.7) |
|
}) |
|
|
|
|
|
if "tests" in state.code_context: |
|
|
|
test_template = self._generate_edge_case_test(edge_case, state.code_context) |
|
if test_template: |
|
state.code_context["tests"].append({ |
|
"name": f"test_edge_case_{len(state.code_context['tests'])}", |
|
"content": test_template, |
|
"description": f"Test handling of {edge_case}" |
|
}) |
|
|
|
def _increase_data_volume(self, state: ProblemState, solution: str) -> None: |
|
""" |
|
Modify the problem to require handling larger data volumes. |
|
|
|
Args: |
|
state: The problem state to modify |
|
solution: The current solution |
|
""" |
|
|
|
state.requirements.append({ |
|
"type": "scalability", |
|
"description": "The solution must efficiently handle large datasets (10,000+ items).", |
|
"difficulty": random.uniform(0.5, 0.8) |
|
}) |
|
|
|
|
|
if "tests" in state.code_context: |
|
for i, test in enumerate(state.code_context["tests"]): |
|
content = test["content"] |
|
|
|
|
|
for pattern, replacement in [ |
|
(r'\[[^\]]{0,50}\]', '[random.randint(0, 1000) for _ in range(10000)]'), |
|
(r'range\(\d+\)', 'range(10000)'), |
|
(r'"[^"]{0,20}"', '"' + 'a' * 10000 + '"') |
|
]: |
|
match = re.search(pattern, content) |
|
if match and random.random() < 0.3: |
|
content = content.replace(match.group(0), replacement, 1) |
|
break |
|
|
|
state.code_context["tests"][i]["content"] = content |
|
state.code_context["tests"][i]["description"] = f"{test.get('description', 'Test')} (with large data)" |
|
|
|
def _add_performance_constraint(self, state: ProblemState, solution: str) -> None: |
|
""" |
|
Add a performance constraint to the problem. |
|
|
|
Args: |
|
state: The problem state to modify |
|
solution: The current solution |
|
""" |
|
|
|
constraints = [ |
|
"linear time complexity (O(n))", |
|
"logarithmic time complexity (O(log n))", |
|
"constant memory usage (O(1) space)", |
|
"execution time under 100ms for large inputs", |
|
"minimal function calls" |
|
] |
|
|
|
constraint = random.choice(constraints) |
|
|
|
|
|
state.requirements.append({ |
|
"type": "performance", |
|
"description": f"The solution must achieve {constraint}.", |
|
"difficulty": random.uniform(0.6, 0.9) |
|
}) |
|
|
|
|
|
if "tests" in state.code_context: |
|
|
|
perf_test = self._generate_performance_test(constraint, state.code_context) |
|
if perf_test: |
|
state.code_context["tests"].append({ |
|
"name": f"test_performance_{len(state.code_context['tests'])}", |
|
"content": perf_test, |
|
"description": f"Test {constraint}" |
|
}) |
|
|
|
def _expand_functionality(self, state: ProblemState, solution: str) -> None: |
|
""" |
|
Expand the required functionality of the solution. |
|
|
|
Args: |
|
state: The problem state to modify |
|
solution: The current solution |
|
""" |
|
|
|
expansions = [ |
|
"support for different input types", |
|
"parameterized behavior", |
|
"additional output formats", |
|
"flexible error handling", |
|
"integration with external systems" |
|
] |
|
|
|
expansion = random.choice(expansions) |
|
|
|
|
|
state.requirements.append({ |
|
"type": "functionality", |
|
"description": f"Expand the solution to include {expansion}.", |
|
"difficulty": random.uniform(0.4, 0.8) |
|
}) |
|
|
|
|
|
if "tests" in state.code_context: |
|
|
|
test_template = self._generate_functionality_test(expansion, state.code_context) |
|
if test_template: |
|
state.code_context["tests"].append({ |
|
"name": f"test_expanded_functionality_{len(state.code_context['tests'])}", |
|
"content": test_template, |
|
"description": f"Test {expansion}" |
|
}) |
|
|
|
def _generate_default_test(self) -> str: |
|
""" |
|
Generate a default test based on the current problem state. |
|
|
|
Returns: |
|
A default test script |
|
""" |
|
|
|
return """ |
|
import unittest |
|
import sys |
|
import os |
|
|
|
# Add the directory containing the solution to the path |
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
|
|
|
# Import the solution |
|
from solution import * |
|
|
|
class DefaultTest(unittest.TestCase): |
|
def test_basic_functionality(self): |
|
# A basic test that should pass if the solution is correct |
|
self.assertTrue(True, "Basic assertion failed") |
|
|
|
def test_expected_output(self): |
|
# Test expected output of main functions |
|
# This will need to be updated based on the specific problem |
|
pass |
|
|
|
if __name__ == '__main__': |
|
unittest.main() |
|
""" |
|
|
|
    def _generate_edge_case_test(self, edge_case: str, code_context: Dict[str, Any]) -> Optional[str]:
|
""" |
|
Generate a test for an edge case. |
|
|
|
Args: |
|
edge_case: The edge case to test |
|
code_context: The code context containing information about the problem |
|
|
|
Returns: |
|
A test script for the edge case |
|
""" |
|
|
|
function_names = [] |
|
if "code" in code_context: |
|
function_names = re.findall(r'def\s+(\w+)', code_context["code"]) |
|
|
|
if not function_names: |
|
return None |
|
|
|
|
|
function_name = random.choice(function_names) |
|
|
|
|
|
if edge_case == "empty collections": |
|
return f""" |
|
import unittest |
|
import sys |
|
import os |
|
|
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
|
from solution import {function_name} |
|
|
|
class EmptyCollectionTest(unittest.TestCase): |
|
def test_empty_input(self): |
|
# Test with empty list |
|
result = {function_name}([]) |
|
self.assertIsNotNone(result, "Function should handle empty list") |
|
|
|
# Test with empty string |
|
result = {function_name}("") |
|
self.assertIsNotNone(result, "Function should handle empty string") |
|
|
|
# Test with empty dict |
|
result = {function_name}({{}}) |
|
self.assertIsNotNone(result, "Function should handle empty dict") |
|
|
|
if __name__ == '__main__': |
|
unittest.main() |
|
""" |
|
elif edge_case == "null/None values": |
|
return f""" |
|
import unittest |
|
import sys |
|
import os |
|
|
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
|
from solution import {function_name} |
|
|
|
class NoneValueTest(unittest.TestCase): |
|
def test_none_input(self): |
|
# Test with None as input |
|
result = {function_name}(None) |
|
self.assertIsNotNone(result, "Function should handle None input") |
|
|
|
# Test with list containing None |
|
result = {function_name}([1, None, 3]) |
|
self.assertIsNotNone(result, "Function should handle list with None values") |
|
|
|
if __name__ == '__main__': |
|
unittest.main() |
|
""" |
|
elif edge_case == "boundary values (min/max)": |
|
return f""" |
|
import unittest
import sys
import os
|
|
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
|
from solution import {function_name} |
|
|
|
class BoundaryValueTest(unittest.TestCase): |
|
def test_min_max_values(self): |
|
# Test with minimum integer |
|
min_int = -sys.maxsize - 1 |
|
result = {function_name}(min_int) |
|
self.assertIsNotNone(result, "Function should handle minimum integer") |
|
|
|
# Test with maximum integer |
|
max_int = sys.maxsize |
|
result = {function_name}(max_int) |
|
self.assertIsNotNone(result, "Function should handle maximum integer") |
|
|
|
# Test with very large list |
|
large_list = list(range(10000)) |
|
result = {function_name}(large_list) |
|
self.assertIsNotNone(result, "Function should handle very large inputs") |
|
|
|
if __name__ == '__main__': |
|
unittest.main() |
|
""" |
|
elif edge_case == "negative numbers": |
|
return f""" |
|
import unittest |
|
import sys |
|
import os |
|
|
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
|
from solution import {function_name} |
|
|
|
class NegativeNumberTest(unittest.TestCase): |
|
def test_negative_numbers(self): |
|
# Test with negative number |
|
result = {function_name}(-1) |
|
self.assertIsNotNone(result, "Function should handle negative numbers") |
|
|
|
# Test with list of negative numbers |
|
result = {function_name}([-1, -2, -3]) |
|
self.assertIsNotNone(result, "Function should handle lists of negative numbers") |
|
|
|
# Test with mixed positive and negative |
|
result = {function_name}([-1, 0, 1]) |
|
self.assertIsNotNone(result, "Function should handle mixed positive and negative") |
|
|
|
if __name__ == '__main__': |
|
unittest.main() |
|
""" |
|
else: |
|
|
|
return f""" |
|
import unittest |
|
import sys |
|
import os |
|
|
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
|
from solution import {function_name} |
|
|
|
class EdgeCaseTest(unittest.TestCase): |
|
def test_edge_case_{edge_case.replace(' ', '_')}(self): |
|
# Test edge case: {edge_case} |
|
# This is a placeholder test that needs to be customized for the specific edge case |
|
self.assertTrue(True, "Edge case test not implemented") |
|
|
|
if __name__ == '__main__': |
|
unittest.main() |
|
""" |
|
|
|
    def _generate_performance_test(self, constraint: str, code_context: Dict[str, Any]) -> Optional[str]:
|
""" |
|
Generate a performance test based on a constraint. |
|
|
|
Args: |
|
constraint: The performance constraint |
|
code_context: The code context containing information about the problem |
|
|
|
Returns: |
|
A test script for the performance constraint |
|
""" |
|
|
|
function_names = [] |
|
if "code" in code_context: |
|
function_names = re.findall(r'def\s+(\w+)', code_context["code"]) |
|
|
|
if not function_names: |
|
return None |
|
|
|
|
|
function_name = random.choice(function_names) |
|
|
|
if "time complexity" in constraint: |
|
return f""" |
|
import unittest |
|
import sys |
|
import os |
|
import time |
|
import random |
|
|
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
|
from solution import {function_name} |
|
|
|
class PerformanceTest(unittest.TestCase): |
|
def test_time_complexity(self): |
|
# Test for {constraint} |
|
sizes = [100, 1000, 10000] |
|
times = [] |
|
|
|
for size in sizes: |
|
# Generate input of the given size |
|
input_data = [random.randint(0, 1000) for _ in range(size)] |
|
|
|
# Measure execution time |
|
start_time = time.time() |
|
{function_name}(input_data) |
|
end_time = time.time() |
|
|
|
times.append(end_time - start_time) |
|
|
|
# Check if time grows appropriately |
|
# For O(n), time should grow linearly with input size |
|
# For O(log n), time should grow logarithmically |
|
# This is a simplified check and might need adjustment |
|
if "log n" in "{constraint}": |
|
# For logarithmic time, the ratio of times should decrease |
|
ratio1 = times[1] / times[0] |
|
ratio2 = times[2] / times[1] |
|
self.assertLess(ratio2, ratio1 * 1.5, |
|
f"Growth rate appears super-logarithmic: {times}") |
|
else: # Assume linear or better |
|
# For linear time, the ratio of times should be roughly equal to ratio of sizes |
|
ratio1 = times[1] / times[0] |
|
size_ratio1 = sizes[1] / sizes[0] |
|
|
|
ratio2 = times[2] / times[1] |
|
size_ratio2 = sizes[2] / sizes[1] |
|
|
|
self.assertLess(ratio1, size_ratio1 * 1.5, |
|
f"First growth rate appears super-linear: {times}") |
|
self.assertLess(ratio2, size_ratio2 * 1.5, |
|
f"Second growth rate appears super-linear: {times}") |
|
|
|
if __name__ == '__main__': |
|
unittest.main() |
|
""" |
|
elif "execution time" in constraint: |
|
return f""" |
|
import unittest |
|
import sys |
|
import os |
|
import time |
|
import random |
|
|
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
|
from solution import {function_name} |
|
|
|
class PerformanceTest(unittest.TestCase): |
|
def test_execution_time(self): |
|
# Test for {constraint} |
|
# Generate a large input |
|
input_data = [random.randint(0, 1000) for _ in range(10000)] |
|
|
|
# Measure execution time |
|
start_time = time.time() |
|
{function_name}(input_data) |
|
end_time = time.time() |
|
|
|
execution_time = (end_time - start_time) * 1000 # Convert to ms |
|
|
|
self.assertLess(execution_time, 100, |
|
f"Execution time exceeded 100ms: {execution_time:.2f}ms") |
|
|
|
if __name__ == '__main__': |
|
unittest.main() |
|
""" |
|
elif "memory usage" in constraint: |
|
return f""" |
|
import unittest |
|
import sys |
|
import os |
|
import psutil |
|
import random |
|
|
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
|
from solution import {function_name} |
|
|
|
class MemoryUsageTest(unittest.TestCase): |
|
def test_memory_usage(self): |
|
# Test for {constraint} |
|
# Note: This is an approximate test and may not be accurate in all environments |
|
|
|
# Get current process |
|
process = psutil.Process(os.getpid()) |
|
|
|
# Measure memory before |
|
memory_before = process.memory_info().rss / 1024 / 1024 # MB |
|
|
|
# Generate a large input |
|
input_data = [random.randint(0, 1000) for _ in range(100000)] |
|
|
|
# Run function |
|
{function_name}(input_data) |
|
|
|
# Measure memory after |
|
memory_after = process.memory_info().rss / 1024 / 1024 # MB |
|
|
|
# Calculate memory usage |
|
memory_used = memory_after - memory_before |
|
|
|
# A crude approximation, adjust as needed |
|
self.assertLess(memory_used, 10, |
|
f"Memory usage seems high: {memory_used:.2f}MB") |
|
|
|
if __name__ == '__main__': |
|
unittest.main() |
|
""" |
|
else: |
|
|
|
return f""" |
|
import unittest |
|
import sys |
|
import os |
|
import time |
|
import random |
|
|
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
|
from solution import {function_name} |
|
|
|
class PerformanceTest(unittest.TestCase): |
|
def test_performance(self): |
|
# Test for {constraint} |
|
# This is a placeholder test that needs to be customized for the specific constraint |
|
|
|
# Generate a large input |
|
input_data = [random.randint(0, 1000) for _ in range(10000)] |
|
|
|
# Measure execution time |
|
start_time = time.time() |
|
{function_name}(input_data) |
|
end_time = time.time() |
|
|
|
execution_time = end_time - start_time |
|
|
|
# Just log the time for now |
|
print(f"Execution time: {execution_time:.4f} seconds") |
|
self.assertTrue(True, "Performance test completed") |
|
|
|
if __name__ == '__main__': |
|
unittest.main() |
|
""" |
|
|
|
def _generate_functionality_test(self, expansion: str, code_context: Dict[str, Any]) -> Optional[str]: |
|
""" |
|
Generate a test for expanded functionality. |
|
|
|
Args: |
|
expansion: The functionality expansion |
|
code_context: The code context containing information about the problem |
|
|
|
Returns: |
|
A test script for the expanded functionality |
|
""" |
|
|
|
function_names = [] |
|
if "code" in code_context: |
|
function_names = re.findall(r'def\s+(\w+)', code_context["code"]) |
|
|
|
if not function_names: |
|
return None |
|
|
|
|
|
function_name = random.choice(function_names) |
|
|
|
if "different input types" in expansion: |
|
return f""" |
|
import unittest |
|
import sys |
|
import os |
|
import json |
|
from collections import namedtuple |
|
|
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
|
from solution import {function_name} |
|
|
|
class InputTypesTest(unittest.TestCase): |
|
def test_different_input_types(self): |
|
# Test with different types of inputs |
|
|
|
# Test with list |
|
list_input = [1, 2, 3] |
|
list_result = {function_name}(list_input) |
|
self.assertIsNotNone(list_result, "Function should handle list input") |
|
|
|
# Test with tuple |
|
tuple_input = (1, 2, 3) |
|
tuple_result = {function_name}(tuple_input) |
|
self.assertIsNotNone(tuple_result, "Function should handle tuple input") |
|
|
|
# Test with set |
|
set_input = {{1, 2, 3}} |
|
set_result = {function_name}(set_input) |
|
self.assertIsNotNone(set_result, "Function should handle set input") |
|
|
|
# Test with dictionary |
|
dict_input = {{"a": 1, "b": 2, "c": 3}} |
|
dict_result = {function_name}(dict_input) |
|
self.assertIsNotNone(dict_result, "Function should handle dictionary input") |
|
|
|
# Test with JSON string |
|
json_input = '{{"data": [1, 2, 3]}}' |
|
json_result = {function_name}(json_input) |
|
self.assertIsNotNone(json_result, "Function should handle JSON string") |
|
|
|
# Test with custom object |
|
Point = namedtuple('Point', ['x', 'y']) |
|
obj_input = Point(1, 2) |
|
obj_result = {function_name}(obj_input) |
|
self.assertIsNotNone(obj_result, "Function should handle custom object") |
|
|
|
if __name__ == '__main__': |
|
unittest.main() |
|
""" |
|
elif "parameterized behavior" in expansion: |
|
return f""" |
|
import unittest |
|
import sys |
|
import os |
|
|
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
|
from solution import {function_name} |
|
|
|
class ParameterizedTest(unittest.TestCase): |
|
def test_parameterized_behavior(self): |
|
# Test function with different parameters |
|
|
|
# Base case with default parameters |
|
base_input = [1, 2, 3] |
|
base_result = {function_name}(base_input) |
|
|
|
# The function should now accept additional parameters |
|
# These are example parameters, adjust based on the specific function |
|
|
|
# With sorting parameter |
|
try: |
|
sorted_result = {function_name}(base_input, sort=True) |
|
self.assertIsNotNone(sorted_result, "Function should handle sort parameter") |
|
except TypeError as e: |
|
self.fail(f"Function does not support sort parameter: {{e}}") |
|
|
|
# With filtering parameter |
|
try: |
|
filtered_result = {function_name}(base_input, filter_fn=lambda x: x > 1) |
|
self.assertIsNotNone(filtered_result, "Function should handle filter_fn parameter") |
|
except TypeError as e: |
|
self.fail(f"Function does not support filter_fn parameter: {{e}}") |
|
|
|
# With formatting parameter |
|
try: |
|
formatted_result = {function_name}(base_input, format="json") |
|
self.assertIsNotNone(formatted_result, "Function should handle format parameter") |
|
except TypeError as e: |
|
self.fail(f"Function does not support format parameter: {{e}}") |
|
|
|
if __name__ == '__main__': |
|
unittest.main() |
|
""" |
|
elif "additional output formats" in expansion: |
|
return f""" |
|
import unittest |
|
import sys |
|
import os |
|
import json |
|
|
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
|
from solution import {function_name} |
|
|
|
class OutputFormatsTest(unittest.TestCase): |
|
def test_output_formats(self): |
|
# Test function with different output formats |
|
input_data = [1, 2, 3] |
|
|
|
# Original format |
|
original_result = {function_name}(input_data) |
|
|
|
# The function should now support different output formats |
|
# These are example formats, adjust based on the specific function |
|
|
|
# JSON format |
|
try: |
|
json_result = {function_name}(input_data, format="json") |
|
# Check if it's valid JSON |
|
try: |
|
json_obj = json.loads(json_result) if isinstance(json_result, str) else json_result |
|
self.assertIsNotNone(json_obj, "JSON result should be valid") |
|
except json.JSONDecodeError: |
|
self.fail("JSON result is not valid") |
|
except TypeError as e: |
|
self.fail(f"Function does not support JSON format: {{e}}") |
|
|
|
# CSV format |
|
try: |
|
csv_result = {function_name}(input_data, format="csv") |
|
self.assertIsNotNone(csv_result, "CSV result should not be None") |
|
if isinstance(csv_result, str): |
|
self.assertIn(",", csv_result, "CSV result should contain commas") |
|
except TypeError as e: |
|
self.fail(f"Function does not support CSV format: {{e}}") |
|
|
|
# XML format |
|
try: |
|
xml_result = {function_name}(input_data, format="xml") |
|
self.assertIsNotNone(xml_result, "XML result should not be None") |
|
if isinstance(xml_result, str): |
|
self.assertIn("<", xml_result, "XML result should contain tags") |
|
self.assertIn(">", xml_result, "XML result should contain tags") |
|
except TypeError as e: |
|
self.fail(f"Function does not support XML format: {{e}}") |
|
|
|
if __name__ == '__main__': |
|
unittest.main() |
|
""" |
|
else: |
|
|
|
return f""" |
|
import unittest |
|
import sys |
|
import os |
|
|
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
|
from solution import {function_name} |
|
|
|
class ExpandedFunctionalityTest(unittest.TestCase): |
|
def test_expanded_functionality(self): |
|
# Test for {expansion} |
|
# This is a placeholder test that needs to be customized for the specific expansion |
|
|
|
# Basic test to verify the function exists |
|
input_data = [1, 2, 3] |
|
result = {function_name}(input_data) |
|
self.assertIsNotNone(result, "Function should return a result") |
|
|
|
# You need to add specific tests for the expanded functionality |
|
|
|
if __name__ == '__main__': |
|
unittest.main() |
|
""" |
|
|
|
def _calculate_adaptation_vector(self, solution: str, result: EvaluationResult, feedback: Feedback) -> List[float]: |
|
""" |
|
Calculate an adaptation vector based on the solution, result, and feedback. |
|
|
|
The adaptation vector encodes how the problem should evolve in future iterations, |
|
capturing dimensions like difficulty, bug type emphasis, and feedback focus. |
|
|
|
Args: |
|
solution: The current solution |
|
result: The evaluation results |
|
feedback: The feedback provided |
|
|
|
Returns: |
|
An adaptation vector (list of floats) |
|
""" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
adaptation_vector = [0.0] * 5 |
|
|
|
|
|
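# Dimension 0 (difficulty): positive for high scores, negative for low scores. |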
if result.score > 0.95: |
|
adaptation_vector[0] = 0.2 |
|
elif result.score > 0.8: |
|
adaptation_vector[0] = 0.1 |
|
elif result.score > 0.6: |
|
adaptation_vector[0] = 0.0 |
|
elif result.score > 0.4: |
|
adaptation_vector[0] = -0.1 |
|
else: |
|
adaptation_vector[0] = -0.2 |
|
|
|
|
|
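# Dimension 1 (bug-type emphasis): negative when syntax issues dominate the feedback, positive when logical (test-failure) issues dominate. |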
syntax_issues = sum(1 for issue in feedback.issues if issue.get("error_type") == "syntax") |
|
logical_issues = sum(1 for issue in feedback.issues if issue.get("type") == "test_failure") |
|
|
|
if syntax_issues > logical_issues: |
|
adaptation_vector[1] = -0.1 |
|
elif logical_issues > syntax_issues: |
|
adaptation_vector[1] = 0.1 |
|
|
|
|
|
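# Dimension 2 (performance): positive when execution time exceeds the configured performance_threshold. |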
if result.metrics and "execution_time" in result.metrics: |
|
if result.metrics["execution_time"] > self.config.get("performance_threshold", 1.0): |
|
adaptation_vector[2] = 0.2 |
|
else: |
|
adaptation_vector[2] = -0.1 |
|
|
|
|
|
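# Dimension 3 (edge cases): positive when any edge-case test failed. |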
if result.test_results: |
|
edge_case_failures = sum(1 for test_name, test_result in result.test_results.items() |
|
if not test_result["passed"] and "edge" in test_name.lower()) |
|
if edge_case_failures > 0: |
|
adaptation_vector[3] = 0.2 |
|
else: |
|
adaptation_vector[3] = 0.0 |
|
|
|
|
|
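# Dimension 4 (requirements): positive when there are fewer than three requirements, negative at five or more. |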
current_requirements = len(self.state.requirements) |
|
if current_requirements < 3: |
|
adaptation_vector[4] = 0.1 |
|
elif current_requirements >= 5: |
|
adaptation_vector[4] = -0.1 |
|
|
|
return adaptation_vector |
|
|
|
|
|
class DefaultTestRunner: |
|
"""Default test runner for evaluating bug fixes.""" |
|
|
|
def run_tests(self, solution_file: Path, test_files: List[Path], code_context: Dict[str, Any]) -> Dict[str, Any]: |
|
""" |
|
Run tests against a solution file. |
|
|
|
Args: |
|
solution_file: Path to the solution file |
|
test_files: List of test file paths |
|
code_context: Context information about the code |
|
|
|
Returns: |
|
Dictionary of test results |
|
""" |
|
|
|
results = { |
|
"all_passed": True, |
|
"passed_tests": 0, |
|
"total_tests": 0, |
|
"tests": {}, |
|
"execution": { |
|
"success": True, |
|
"error": None, |
|
"stdout": None, |
|
"stderr": None |
|
}, |
|
"execution_time": 0.0 |
|
} |
|
|
|
|
|
try: |
|
|
|
if not solution_file.exists(): |
|
results["execution"]["success"] = False |
|
results["execution"]["error"] = "Solution file not found" |
|
results["all_passed"] = False |
|
return results |
|
|
|
|
|
import sys  # imported locally, like the other runner dependencies below |
sys.path.insert(0, str(solution_file.parent)) |
|
import importlib.util |
|
spec = importlib.util.spec_from_file_location("solution", solution_file) |
|
solution_module = importlib.util.module_from_spec(spec) |
|
spec.loader.exec_module(solution_module) |
|
|
|
|
|
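# Check that every function required by the task is actually defined in the solution. |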
if "required_functions" in code_context: |
|
for func_name in code_context["required_functions"]: |
|
if not hasattr(solution_module, func_name): |
|
results["execution"]["success"] = False |
|
results["execution"]["error"] = f"Required function '{func_name}' not found" |
|
results["all_passed"] = False |
|
return results |
|
|
|
except Exception as e: |
|
results["execution"]["success"] = False |
|
results["execution"]["error"] = str(e) |
|
results["all_passed"] = False |
|
return results |
|
|
|
|
|
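# Run each test file in a separate unittest discovery pass. |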
for test_file in test_files: |
|
|
|
if not test_file.exists(): |
|
continue |
|
|
|
|
|
import unittest |
|
import io |
|
from contextlib import redirect_stdout, redirect_stderr |
|
|
|
|
|
loader = unittest.TestLoader() |
|
try: |
|
tests = loader.discover(str(test_file.parent), pattern=test_file.name) |
|
|
|
|
|
test_cases = 0 |
|
for suite in tests: |
|
for test_case in suite: |
|
test_cases += test_case.countTestCases() |
|
|
|
results["total_tests"] += test_cases |
|
|
|
|
|
runner = unittest.TextTestRunner(verbosity=2) |
|
|
|
|
|
stdout_buffer = io.StringIO() |
|
stderr_buffer = io.StringIO() |
|
|
|
with redirect_stdout(stdout_buffer), redirect_stderr(stderr_buffer): |
|
test_result = runner.run(tests) |
|
|
|
stdout = stdout_buffer.getvalue() |
|
stderr = stderr_buffer.getvalue() |
|
|
|
|
|
if not test_result.wasSuccessful(): |
|
results["all_passed"] = False |
|
|
|
|
|
passed_tests = test_cases - len(test_result.failures) - len(test_result.errors) |
|
results["passed_tests"] += passed_tests |
|
|
|
|
|
test_name = test_file.stem |
|
results["tests"][test_name] = { |
|
"passed": test_result.wasSuccessful(), |
|
"failures": len(test_result.failures), |
|
"errors": len(test_result.errors), |
|
"skipped": len(test_result.skipped), |
|
"total": test_cases, |
|
"passed_count": passed_tests, |
|
"stdout": stdout, |
|
"stderr": stderr |
|
} |
|
|
|
|
|
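# Record details for each failure, pulling expected/actual values out of the message when present. |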
for failure in test_result.failures: |
|
test_id = failure[0].id() |
|
failure_message = failure[1] |
|
|
|
|
|
import re |
|
expected_match = re.search(r'Expected\s*:(.+)', failure_message) |
|
actual_match = re.search(r'Actual\s*:(.+)', failure_message) |
|
|
|
expected = expected_match.group(1).strip() if expected_match else None |
|
actual = actual_match.group(1).strip() if actual_match else None |
|
|
|
if test_id not in results["tests"]: |
|
results["tests"][test_id] = {} |
|
|
|
results["tests"][test_id].update({ |
|
"passed": False, |
|
"message": failure_message, |
|
"expected": expected, |
|
"actual": actual |
|
}) |
|
|
|
except Exception as e: |
|
|
|
results["all_passed"] = False |
|
results["tests"][test_file.stem] = { |
|
"passed": False, |
|
"error": str(e), |
|
"failures": 1, |
|
"errors": 1, |
|
"skipped": 0, |
|
"total": 1, |
|
"passed_count": 0 |
|
} |
|
results["total_tests"] += 1 |
|
|
|
return results |
|
|
|
|
|
class BugFixingTaskGenerator: |
|
"""Generator for bug fixing tasks.""" |
|
|
|
def __init__(self, config: Dict[str, Any] = None): |
|
""" |
|
Initialize the bug fixing task generator. |
|
|
|
Args: |
|
config: Configuration options |
|
""" |
|
self.config = config or {} |
|
self.difficulty_levels = self.config.get( |
|
"difficulty_levels", |
|
["easy", "medium", "hard", "expert"] |
|
) |
|
self.bug_categories = self.config.get( |
|
"bug_categories", |
|
[ |
|
BugCategory.SYNTAX, |
|
BugCategory.LOGICAL, |
|
BugCategory.EDGE_CASE, |
|
BugCategory.PERFORMANCE |
|
] |
|
) |
|
self.test_templates = self._load_test_templates() |
|
|
|
def generate_task(self, difficulty: str = None, bug_categories: List[str] = None) -> BugFixingTask: |
|
""" |
|
Generate a new bug fixing task. |
|
|
|
Args: |
|
difficulty: The difficulty level (easy, medium, hard, expert) |
|
bug_categories: List of bug categories to include |
|
|
|
Returns: |
|
A new bug fixing task |
|
""" |
|
|
|
if difficulty is None: |
|
difficulty = random.choice(self.difficulty_levels) |
|
|
|
|
|
if bug_categories is None: |
|
num_categories = random.randint(1, 3) |
|
bug_categories = random.sample(self.bug_categories, num_categories) |
|
|
|
|
|
problem_state = self._generate_problem_state(difficulty, bug_categories) |
|
|
|
|
|
task_config = { |
|
"difficulty": difficulty, |
|
"bug_categories": bug_categories, |
|
"convergence_criteria": { |
|
"score_threshold": 0.95, |
|
"min_iterations": 1, |
|
"max_iterations": self.config.get("max_iterations", 5), |
|
"score_delta_threshold": 0.05, |
|
"consecutive_plateau_limit": 2 |
|
}, |
|
"score_weights": { |
|
"test": 0.7, |
|
"execution": 0.3 |
|
}, |
|
"performance_threshold": 1.0, |
|
"complexity_threshold": 0.7 |
|
} |
|
|
|
|
|
return BugFixingTask(problem_state, task_config) |
|
|
|
def _generate_problem_state(self, difficulty: str, bug_categories: List[str]) -> ProblemState: |
|
""" |
|
Generate a problem state for the given difficulty and bug categories. |
|
|
|
Args: |
|
difficulty: The difficulty level |
|
bug_categories: List of bug categories |
|
|
|
Returns: |
|
A problem state for the task |
|
""" |
|
|
|
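# Start from a template for this difficulty and deep-copy it so the template itself is never mutated. |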
template = self._choose_template(difficulty, bug_categories) |
|
|
|
|
|
problem_state = copy.deepcopy(template) |
|
|
|
|
|
problem_state.problem_id = str(uuid.uuid4()) |
|
|
|
|
|
problem_state.evolution_stage = 0 |
|
problem_state.adaptation_vector = [0.0] * 5 |
|
|
|
|
|
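# Map the symbolic difficulty level onto a numeric difficulty in [0, 1]. |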
difficulty_values = { |
|
"easy": 0.25, |
|
"medium": 0.5, |
|
"hard": 0.75, |
|
"expert": 0.9 |
|
} |
|
problem_state.difficulty = difficulty_values.get(difficulty, 0.5) |
|
|
|
|
|
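# Inject one bug for each requested category into the template code. |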
for category in bug_categories: |
|
self._insert_bug(problem_state, category) |
|
|
|
|
|
problem_state.description = self._generate_description(problem_state) |
|
|
|
return problem_state |
|
|
|
def _choose_template(self, difficulty: str, bug_categories: List[str]) -> ProblemState: |
|
""" |
|
Choose a template that matches the difficulty and bug categories. |
|
|
|
Args: |
|
difficulty: The difficulty level |
|
bug_categories: List of bug categories |
|
|
|
Returns: |
|
A template problem state |
|
""" |
|
|
|
|
|
|
|
|
|
code = self._generate_template_code(difficulty, bug_categories) |
|
tests = self._generate_template_tests(code) |
|
|
|
|
|
return ProblemState( |
|
problem_id="template", |
|
description="Fix the bugs in the given code.", |
|
code_context={ |
|
"code": code, |
|
"tests": tests, |
|
"bug_count": 0, |
|
"bug_categories": [] |
|
}, |
|
requirements=[ |
|
{ |
|
"type": "functional", |
|
"description": "The code should pass all the provided tests.", |
|
"difficulty": 0.3 |
|
} |
|
], |
|
difficulty=0.5, |
|
evolution_stage=0, |
|
adaptation_vector=[0.0] * 5 |
|
) |
|
|
|
def _generate_template_code(self, difficulty: str, bug_categories: List[str]) -> str: |
|
""" |
|
Generate template code based on difficulty and bug categories. |
|
|
|
Args: |
|
difficulty: The difficulty level |
|
bug_categories: List of bug categories |
|
|
|
Returns: |
|
Template code |
|
""" |
|
|
|
templates = { |
|
"easy": """ |
|
def calculate_sum(numbers): |
|
\"\"\"Calculate the sum of a list of numbers.\"\"\" |
|
total = 0 |
|
for num in numbers: |
|
total += num |
|
return total |
|
|
|
def calculate_average(numbers): |
|
\"\"\"Calculate the average of a list of numbers.\"\"\" |
|
if not numbers: |
|
return 0 |
|
return calculate_sum(numbers) / len(numbers) |
|
""", |
|
"medium": """ |
|
def find_most_frequent(items): |
| 
\"\"\"Find the most frequently occurring item in a list.\"\"\" |
|
if not items: |
|
return None |
|
|
|
counts = {} |
|
for item in items: |
|
if item in counts: |
|
counts[item] += 1 |
|
else: |
|
counts[item] = 1 |
|
|
|
max_count = 0 |
|
max_item = None |
|
for item, count in counts.items(): |
|
if count > max_count: |
|
max_count = count |
|
max_item = item |
|
|
|
return max_item |
|
|
|
def binary_search(sorted_list, target): |
|
"""Perform binary search on a sorted list.""" |
|
left = 0 |
|
right = len(sorted_list) - 1 |
|
|
|
while left <= right: |
|
mid = (left + right) // 2 |
|
if sorted_list[mid] == target: |
|
return mid |
|
elif sorted_list[mid] < target: |
|
left = mid + 1 |
|
else: |
|
right = mid - 1 |
|
|
|
return -1 # Target not found |
|
""", |
|
"hard": """ |
|
def merge_sort(arr): |
|
"""Sort an array using the merge sort algorithm.""" |
|
if len(arr) <= 1: |
|
return arr |
|
|
|
# Split the array into two halves |
|
mid = len(arr) // 2 |
|
left_half = arr[:mid] |
|
right_half = arr[mid:] |
|
|
|
# Recursively sort both halves |
|
left_half = merge_sort(left_half) |
|
right_half = merge_sort(right_half) |
|
|
|
# Merge the sorted halves |
|
return merge(left_half, right_half) |
|
|
|
def merge(left, right): |
|
"""Merge two sorted arrays.""" |
|
result = [] |
|
i = j = 0 |
|
|
|
# Compare elements from both arrays and add the smaller one to the result |
|
while i < len(left) and j < len(right): |
|
if left[i] <= right[j]: |
|
result.append(left[i]) |
|
i += 1 |
|
else: |
|
result.append(right[j]) |
|
j += 1 |
|
|
|
# Add any remaining elements |
|
result.extend(left[i:]) |
|
result.extend(right[j:]) |
|
|
|
return result |
|
|
|
def quicksort(arr): |
|
"""Sort an array using the quicksort algorithm.""" |
|
if len(arr) <= 1: |
|
return arr |
|
|
|
# Choose the pivot (using the first element for simplicity) |
|
pivot = arr[0] |
|
|
|
# Partition the array |
|
less = [x for x in arr[1:] if x <= pivot] |
|
greater = [x for x in arr[1:] if x > pivot] |
|
|
|
# Recursively sort the partitions and combine |
|
return quicksort(less) + [pivot] + quicksort(greater) |
|
""", |
|
"expert": """ |
|
class Node: |
|
"""Node in a binary tree.""" |
|
def __init__(self, value): |
|
self.value = value |
|
self.left = None |
|
self.right = None |
|
|
|
def build_binary_tree(values): |
|
"""Build a binary tree from a list of values.""" |
|
if not values: |
|
return None |
|
|
|
root = Node(values[0]) |
|
queue = [root] |
|
i = 1 |
|
|
|
while queue and i < len(values): |
|
node = queue.pop(0) |
|
|
|
# Add left child |
|
if i < len(values) and values[i] is not None: |
|
node.left = Node(values[i]) |
|
queue.append(node.left) |
|
i += 1 |
|
|
|
# Add right child |
|
if i < len(values) and values[i] is not None: |
|
node.right = Node(values[i]) |
|
queue.append(node.right) |
|
i += 1 |
|
|
|
return root |
|
|
|
def is_balanced(root): |
|
"""Check if a binary tree is balanced.""" |
|
def height(node): |
|
if not node: |
|
return 0 |
|
return max(height(node.left), height(node.right)) + 1 |
|
|
|
def is_balanced_helper(node): |
|
if not node: |
|
return True |
|
|
|
left_height = height(node.left) |
|
right_height = height(node.right) |
|
|
|
if abs(left_height - right_height) > 1: |
|
return False |
|
|
|
return is_balanced_helper(node.left) and is_balanced_helper(node.right) |
|
|
|
return is_balanced_helper(root) |
|
|
|
def find_lca(root, p, q): |
|
"""Find the lowest common ancestor of two nodes in a binary tree.""" |
|
if not root: |
|
return None |
|
|
|
if root.value == p or root.value == q: |
|
return root |
|
|
|
left_lca = find_lca(root.left, p, q) |
|
right_lca = find_lca(root.right, p, q) |
|
|
|
if left_lca and right_lca: |
|
return root |
|
|
|
return left_lca if left_lca else right_lca |
|
""" |
|
} |
|
|
|
|
|
if difficulty in templates: |
|
return templates[difficulty] |
|
else: |
|
return templates["medium"] |
|
|
|
def _generate_template_tests(self, code: str) -> List[Dict[str, Any]]: |
|
""" |
|
Generate template tests based on the code. |
|
|
|
Args: |
|
code: The template code |
|
|
|
Returns: |
|
List of test dictionaries |
|
""" |
|
|
|
function_names = re.findall(r'def\s+(\w+)', code) |
|
|
|
|
|
tests = [] |
|
for func_name in function_names: |
|
test_content = self._generate_test_for_function(func_name) |
|
if test_content: |
|
tests.append({ |
|
"name": f"test_{func_name}", |
|
"content": test_content, |
|
"description": f"Test for {func_name} function" |
|
}) |
|
|
|
return tests |
|
|
|
def _generate_test_for_function(self, func_name: str) -> str: |
|
""" |
|
Generate a test for a specific function. |
|
|
|
Args: |
|
func_name: The name of the function to test |
|
|
|
Returns: |
|
Test content |
|
""" |
|
|
|
if func_name in self.test_templates: |
|
return self.test_templates[func_name] |
|
|
|
|
|
if "sum" in func_name.lower(): |
|
return """ |
|
import unittest |
|
import sys |
|
import os |
|
|
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
|
from solution import calculate_sum |
|
|
|
class TestCalculateSum(unittest.TestCase): |
|
def test_calculate_sum(self): |
|
self.assertEqual(calculate_sum([1, 2, 3, 4, 5]), 15) |
|
self.assertEqual(calculate_sum([]), 0) |
|
self.assertEqual(calculate_sum([-1, -2, -3]), -6) |
|
|
|
if __name__ == '__main__': |
|
unittest.main() |
|
""" |
|
elif "average" in func_name.lower(): |
|
return """ |
|
import unittest |
|
import sys |
|
import os |
|
|
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
|
from solution import calculate_average |
|
|
|
class TestCalculateAverage(unittest.TestCase): |
|
def test_calculate_average(self): |
|
self.assertEqual(calculate_average([1, 2, 3, 4, 5]), 3) |
|
self.assertEqual(calculate_average([]), 0) |
|
self.assertEqual(calculate_average([10]), 10) |
|
|
|
if __name__ == '__main__': |
|
unittest.main() |
|
""" |
|
elif "frequent" in func_name.lower(): |
|
return """ |
|
import unittest |
|
import sys |
|
import os |
|
|
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
|
from solution import find_most_frequent |
|
|
|
class TestFindMostFrequent(unittest.TestCase): |
|
def test_find_most_frequent(self): |
|
self.assertEqual(find_most_frequent([1, 2, 2, 3, 3, 3, 4]), 3) |
|
self.assertEqual(find_most_frequent(['a', 'b', 'a', 'c', 'a']), 'a') |
|
self.assertIsNone(find_most_frequent([])) |
|
self.assertEqual(find_most_frequent([5]), 5) |
|
|
|
if __name__ == '__main__': |
|
unittest.main() |
|
""" |
|
elif "search" in func_name.lower(): |
|
return """ |
|
import unittest |
|
import sys |
|
import os |
|
|
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
|
from solution import binary_search |
|
|
|
class TestBinarySearch(unittest.TestCase): |
|
def test_binary_search(self): |
|
self.assertEqual(binary_search([1, 2, 3, 4, 5], 3), 2) |
|
self.assertEqual(binary_search([1, 2, 3, 4, 5], 1), 0) |
|
self.assertEqual(binary_search([1, 2, 3, 4, 5], 5), 4) |
|
self.assertEqual(binary_search([1, 2, 3, 4, 5], 6), -1) |
|
self.assertEqual(binary_search([], 5), -1) |
|
|
|
if __name__ == '__main__': |
|
unittest.main() |
|
""" |
|
elif "sort" in func_name.lower(): |
|
return """ |
|
import unittest |
|
import sys |
|
import os |
|
|
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
|
from solution import {0} |
|
|
|
class Test{1}(unittest.TestCase): |
|
def test_sorting(self): |
|
self.assertEqual({0}([]), []) |
|
self.assertEqual({0}([1]), [1]) |
|
self.assertEqual({0}([3, 1, 4, 1, 5, 9, 2, 6, 5]), [1, 1, 2, 3, 4, 5, 5, 6, 9]) |
|
self.assertEqual({0}([9, 8, 7, 6, 5, 4, 3, 2, 1]), [1, 2, 3, 4, 5, 6, 7, 8, 9]) |
|
self.assertEqual({0}([1, 1, 1, 1]), [1, 1, 1, 1]) |
|
|
|
if __name__ == '__main__': |
|
unittest.main() |
|
""".format(func_name, func_name.title()) |
|
elif "balanced" in func_name.lower(): |
|
return """ |
|
import unittest |
|
import sys |
|
import os |
|
|
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
|
from solution import Node, is_balanced |
|
|
|
class TestIsBalanced(unittest.TestCase): |
|
def test_is_balanced(self): |
|
# Create a balanced tree |
|
# 1 |
|
# / \\ |
|
# 2 3 |
|
# / \\ / \\ |
|
# 4 5 6 7 |
|
root = Node(1) |
|
root.left = Node(2) |
|
root.right = Node(3) |
|
root.left.left = Node(4) |
|
root.left.right = Node(5) |
|
root.right.left = Node(6) |
|
root.right.right = Node(7) |
|
self.assertTrue(is_balanced(root)) |
|
|
|
# Create an unbalanced tree |
|
# 1 |
|
# / \\ |
|
# 2 3 |
|
# / \\ |
|
# 4 5 |
|
#/ |
|
#6 |
|
root = Node(1) |
|
root.left = Node(2) |
|
root.right = Node(3) |
|
root.left.left = Node(4) |
|
root.left.right = Node(5) |
|
root.left.left.left = Node(6) |
|
self.assertFalse(is_balanced(root)) |
|
|
|
# Empty tree is balanced |
|
self.assertTrue(is_balanced(None)) |
|
|
|
if __name__ == '__main__': |
|
unittest.main() |
|
""" |
|
elif "lca" in func_name.lower(): |
|
return """ |
|
import unittest |
|
import sys |
|
import os |
|
|
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
|
from solution import Node, find_lca |
|
|
|
class TestFindLCA(unittest.TestCase): |
|
def test_find_lca(self): |
|
# Create a tree |
|
# 1 |
|
# / \\ |
|
# 2 3 |
|
# / \\ / \\ |
|
# 4 5 6 7 |
|
root = Node(1) |
|
root.left = Node(2) |
|
root.right = Node(3) |
|
root.left.left = Node(4) |
|
root.left.right = Node(5) |
|
root.right.left = Node(6) |
|
root.right.right = Node(7) |
|
|
|
# Test cases |
|
self.assertEqual(find_lca(root, 4, 5).value, 2) # LCA of 4 and 5 is 2 |
|
self.assertEqual(find_lca(root, 4, 6).value, 1) # LCA of 4 and 6 is 1 |
|
self.assertEqual(find_lca(root, 3, 7).value, 3) # LCA of 3 and 7 is 3 |
|
self.assertEqual(find_lca(root, 2, 7).value, 1) # LCA of 2 and 7 is 1 |
|
|
|
if __name__ == '__main__': |
|
unittest.main() |
|
""" |
|
elif "tree" in func_name.lower(): |
|
return """ |
|
import unittest |
|
import sys |
|
import os |
|
|
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
|
from solution import Node, build_binary_tree |
|
|
|
class TestBuildBinaryTree(unittest.TestCase): |
|
def test_build_binary_tree(self): |
|
# Test empty list |
|
self.assertIsNone(build_binary_tree([])) |
|
|
|
# Test single node |
|
root = build_binary_tree([1]) |
|
self.assertEqual(root.value, 1) |
|
self.assertIsNone(root.left) |
|
self.assertIsNone(root.right) |
|
|
|
# Test complete tree |
|
# 1 |
|
# / \\ |
|
# 2 3 |
|
# / \\ / \\ |
|
# 4 5 6 7 |
|
values = [1, 2, 3, 4, 5, 6, 7] |
|
root = build_binary_tree(values) |
|
self.assertEqual(root.value, 1) |
|
self.assertEqual(root.left.value, 2) |
|
self.assertEqual(root.right.value, 3) |
|
self.assertEqual(root.left.left.value, 4) |
|
self.assertEqual(root.left.right.value, 5) |
|
self.assertEqual(root.right.left.value, 6) |
|
self.assertEqual(root.right.right.value, 7) |
|
|
|
# Test tree with None values |
|
# 1 |
|
# / \\ |
|
# 2 3 |
|
# / / |
|
# 4 6 |
|
values = [1, 2, 3, 4, None, 6, None] |
|
root = build_binary_tree(values) |
|
self.assertEqual(root.value, 1) |
|
self.assertEqual(root.left.value, 2) |
|
self.assertEqual(root.right.value, 3) |
|
self.assertEqual(root.left.left.value, 4) |
|
self.assertIsNone(root.left.right) |
|
self.assertEqual(root.right.left.value, 6) |
|
self.assertIsNone(root.right.right) |
|
|
|
if __name__ == '__main__': |
|
unittest.main() |
|
""" |
|
else: |
|
|
|
return """ |
|
import unittest |
|
import sys |
|
import os |
|
|
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
|
from solution import {0} |
|
|
|
class Test{1}(unittest.TestCase): |
|
def test_{0}(self): |
|
# TODO: Add specific test cases for {0} |
|
# This is a placeholder test |
|
self.assertTrue(True) |
|
|
|
if __name__ == '__main__': |
|
unittest.main() |
|
""".format(func_name, func_name.title()) |
|
|
|
def _load_test_templates(self) -> Dict[str, str]: |
|
""" |
|
Load test templates for common functions. |
|
|
|
Returns: |
|
Dictionary of test templates |
|
""" |
|
|
|
return { |
|
"calculate_sum": """ |
|
import unittest |
|
import sys |
|
import os |
|
|
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
|
from solution import calculate_sum |
|
|
|
class TestCalculateSum(unittest.TestCase): |
|
def test_calculate_sum(self): |
|
self.assertEqual(calculate_sum([1, 2, 3, 4, 5]), 15) |
|
self.assertEqual(calculate_sum([]), 0) |
|
self.assertEqual(calculate_sum([-1, -2, -3]), -6) |
|
|
|
if __name__ == '__main__': |
|
unittest.main() |
|
""", |
|
"calculate_average": """ |
|
import unittest |
|
import sys |
|
import os |
|
|
|
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
|
from solution import calculate_average |
|
|
|
class TestCalculateAverage(unittest.TestCase): |
|
def test_calculate_average(self): |
|
self.assertEqual(calculate_average([1, 2, 3, 4, 5]), 3) |
|
self.assertEqual(calculate_average([]), 0) |
|
self.assertEqual(calculate_average([10]), 10) |
|
|
|
if __name__ == '__main__': |
|
unittest.main() |
|
""" |
|
} |
|
|
|
def _insert_bug(self, problem_state: ProblemState, bug_category: str) -> None: |
|
""" |
|
Insert a bug of the specified category into the problem state. |
|
|
|
Args: |
|
problem_state: The problem state to modify |
|
bug_category: The category of bug to insert |
|
""" |
|
if "code" not in problem_state.code_context: |
|
return |
|
|
|
|
|
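# Only inject bugs into code that currently parses cleanly. |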
code = problem_state.code_context["code"] |
|
try: |
|
parsed_code = ast.parse(code) |
|
except SyntaxError: |
|
|
|
return |
|
|
|
|
|
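# Dispatch to the injector for the requested category; unknown categories fall back to a logical bug. |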
if bug_category == BugCategory.SYNTAX: |
|
self._insert_syntax_bug(problem_state) |
|
elif bug_category == BugCategory.LOGICAL: |
|
self._insert_logical_bug(problem_state) |
|
elif bug_category == BugCategory.PERFORMANCE: |
|
self._insert_performance_bug(problem_state) |
|
elif bug_category == BugCategory.EDGE_CASE: |
|
self._insert_edge_case_bug(problem_state) |
|
else: |
|
|
|
self._insert_logical_bug(problem_state) |
|
|
|
|
|
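# Record how many bugs have been injected and which categories are present. |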
if "bug_count" not in problem_state.code_context: |
|
problem_state.code_context["bug_count"] = 0 |
|
problem_state.code_context["bug_count"] += 1 |
|
|
|
if "bug_categories" not in problem_state.code_context: |
|
problem_state.code_context["bug_categories"] = [] |
|
if bug_category not in problem_state.code_context["bug_categories"]: |
|
problem_state.code_context["bug_categories"].append(bug_category) |
|
|
|
def _insert_syntax_bug(self, problem_state: ProblemState) -> None: |
|
""" |
|
Insert a syntax bug into the problem state. |
|
|
|
Args: |
|
problem_state: The problem state to modify |
|
""" |
|
code = problem_state.code_context["code"] |
|
lines = code.split('\n') |
|
if not lines: |
|
return |
|
|
|
|
|
idx = random.randint(0, len(lines) - 1) |
|
line = lines[idx] |
|
|
|
|
|
attempts = 0 |
|
while (not line.strip() or line.strip().startswith('#')) and attempts < 10: |
|
idx = random.randint(0, len(lines) - 1) |
|
line = lines[idx] |
|
attempts += 1 |
|
|
|
if attempts >= 10: |
|
|
|
for i, line in enumerate(lines): |
|
if line.strip() and not line.strip().startswith('#'): |
|
idx = i |
|
break |
|
else: |
|
return |
|
|
|
|
|
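# Apply one small syntactic mutation to the chosen line. |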
mod_type = random.choice([ |
|
"remove_character", |
|
"add_character", |
|
"swap_characters", |
|
"change_indent" |
|
]) |
|
|
|
if mod_type == "remove_character" and line: |
|
char_idx = random.randint(0, len(line) - 1) |
|
lines[idx] = line[:char_idx] + line[char_idx+1:] |
|
|
|
elif mod_type == "add_character": |
|
char_idx = random.randint(0, len(line)) |
|
char = random.choice(["(", ")", "{", "}", "[", "]", ":", ";", ",", "."]) |
|
lines[idx] = line[:char_idx] + char + line[char_idx:] |
|
|
|
elif mod_type == "swap_characters" and len(line) >= 2: |
|
char_idx = random.randint(0, len(line) - 2) |
|
lines[idx] = (line[:char_idx] + line[char_idx+1] + |
|
line[char_idx] + line[char_idx+2:]) |
|
|
|
elif mod_type == "change_indent": |
|
|
|
if line.startswith(" "): |
|
lines[idx] = line[2:] |
|
else: |
|
lines[idx] = " " + line |
|
|
|
|
|
problem_state.code_context["code"] = '\n'.join(lines) |
|
|
|
|
|
if "bugs" not in problem_state.code_context: |
|
problem_state.code_context["bugs"] = [] |
|
|
|
problem_state.code_context["bugs"].append({ |
|
"type": BugCategory.SYNTAX, |
|
"line": idx + 1, |
|
"description": f"Syntax error introduced in line {idx + 1}" |
|
}) |
|
|
|
def _insert_logical_bug(self, problem_state: ProblemState) -> None: |
|
""" |
|
Insert a logical bug into the problem state. |
|
|
|
Args: |
|
problem_state: The problem state to modify |
|
""" |
|
code = problem_state.code_context["code"] |
|
lines = code.split('\n') |
|
if not lines: |
|
return |
|
|
|
|
|
if_statements = [] |
|
for i, line in enumerate(lines): |
|
if re.search(r'\bif\b|\bwhile\b|\bfor\b', line): |
|
if_statements.append((i, line)) |
|
|
|
|
|
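# Choose a logical mutation: flip a comparison, invert a condition, introduce an off-by-one, or swap an operator. |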
mod_type = random.choice([ |
|
"change_comparison", |
|
"invert_condition", |
|
"off_by_one", |
|
"change_operator", |
|
"reverse_logic" |
|
]) |
|
|
|
if if_statements: |
|
|
|
idx, line = random.choice(if_statements) |
|
|
|
if mod_type == "change_comparison": |
|
|
|
comparisons = {"==": "!=", "!=": "==", ">": "<", "<": ">", ">=": "<=", "<=": ">="} |
|
for op, new_op in comparisons.items(): |
|
if op in line: |
|
lines[idx] = line.replace(op, new_op, 1) |
|
break |
|
|
|
elif mod_type == "invert_condition": |
|
|
|
if "not" in line: |
|
lines[idx] = line.replace("not ", "", 1) |
|
else: |
|
match = re.search(r'(if|while)\s+([^:]+):', line) |
|
if match: |
|
condition = match.group(2) |
|
lines[idx] = line.replace(condition, f"not ({condition})", 1) |
|
|
|
elif mod_type == "off_by_one": |
|
|
|
for op in ["+", "-"]: |
|
if op in line: |
|
|
|
match = re.search(f'\\{op}\\s*(\\d+)', line) |
|
if match: |
|
num = int(match.group(1)) |
|
new_num = num + 1 if op == "+" else max(0, num - 1) |
|
lines[idx] = line.replace(f"{op} {num}", f"{op} {new_num}", 1) |
|
break |
|
|
|
elif mod_type == "change_operator": |
|
|
|
operators = {"+": "-", "-": "+", "*": "/", "/": "*", "and": "or", "or": "and"} |
|
for op, new_op in operators.items(): |
|
if f" {op} " in line: |
|
lines[idx] = line.replace(f" {op} ", f" {new_op} ", 1) |
|
break |
|
|
|
elif mod_type == "reverse_logic": |
|
|
|
if " and " in line: |
|
parts = line.split(" and ") |
|
lines[idx] = line.replace(" and ".join(parts), " or ".join(parts), 1) |
|
elif " or " in line: |
|
parts = line.split(" or ") |
|
lines[idx] = line.replace(" or ".join(parts), " and ".join(parts), 1) |
|
|
|
else: |
|
|
|
|
|
assignments = [] |
|
for i, line in enumerate(lines): |
|
if "=" in line and "==" not in line and "!=" not in line: |
|
assignments.append((i, line)) |
|
|
|
if assignments: |
|
|
|
idx, line = random.choice(assignments) |
|
|
|
|
|
if "+" in line: |
|
lines[idx] = line.replace("+", "-", 1) |
|
elif "-" in line: |
|
lines[idx] = line.replace("-", "+", 1) |
|
elif "*" in line: |
|
lines[idx] = line.replace("*", "/", 1) |
|
elif "/" in line: |
|
lines[idx] = line.replace("/", "*", 1) |
|
else: |
|
|
|
match = re.search(r'=\s*(\d+)', line) |
|
if match: |
|
num = int(match.group(1)) |
|
new_num = num + random.choice([-1, 1]) * random.randint(1, 3) |
|
lines[idx] = line.replace(f"= {num}", f"= {new_num}", 1) |
|
|
|
|
|
problem_state.code_context["code"] = '\n'.join(lines) |
|
|
|
|
|
if "bugs" not in problem_state.code_context: |
|
problem_state.code_context["bugs"] = [] |
|
|
|
problem_state.code_context["bugs"].append({ |
|
"type": BugCategory.LOGICAL, |
|
"line": idx + 1, |
|
"description": f"Logical error introduced in line {idx + 1}" |
|
}) |
|
|
|
def _insert_performance_bug(self, problem_state: ProblemState) -> None: |
|
""" |
|
Insert a performance bug into the problem state. |
|
|
|
Args: |
|
problem_state: The problem state to modify |
|
""" |
|
code = problem_state.code_context["code"] |
|
lines = code.split('\n') |
|
if not lines: |
|
return |
|
|
|
|
|
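# Locate function boundaries so the slowdown can be injected inside a single function. |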
functions = [] |
|
current_func = None |
|
func_start = None |
|
for i, line in enumerate(lines): |
|
if line.strip().startswith("def "): |
|
if current_func: |
|
functions.append((func_start, i - 1, current_func)) |
|
current_func = line.strip()[4:].split("(")[0] |
|
func_start = i |
|
elif i == len(lines) - 1 and current_func: |
|
functions.append((func_start, i, current_func)) |
|
|
|
if not functions: |
|
return |
|
|
|
|
|
start_idx, end_idx, func_name = random.choice(functions) |
|
|
|
|
|
mod_type = random.choice([ |
|
"add_nested_loop", |
|
"inefficient_data_structure", |
|
"redundant_computation" |
|
]) |
|
|
|
if mod_type == "add_nested_loop": |
|
|
|
for i in range(start_idx + 1, end_idx + 1): |
|
if lines[i].strip(): |
|
indent = len(lines[i]) - len(lines[i].lstrip()) |
|
break |
|
else: |
|
indent = 4 |
|
|
|
|
|
for i in range(start_idx + 1, end_idx + 1): |
|
if "for " in lines[i] or "while " in lines[i]: |
|
|
|
inner_indent = len(lines[i]) - len(lines[i].lstrip()) + 4 |
|
inner_indent_str = ' ' * inner_indent |
|
|
|
|
|
lines.insert(i + 1, f"{inner_indent_str}for _ in range(100): # Inefficient nested loop") |
|
lines.insert(i + 2, f"{inner_indent_str} pass") |
|
|
|
|
|
end_idx += 2 |
|
break |
|
else: |
|
|
|
inner_indent = indent + 4 |
|
inner_indent_str = ' ' * inner_indent |
|
|
|
|
|
for i in range(start_idx + 1, end_idx + 1): |
|
if lines[i].strip() and not (lines[i].strip().startswith('"""') or lines[i].strip().startswith("'''")): |
|
|
|
lines.insert(i, f"{' ' * indent}for i in range(100): # Inefficient loop") |
|
lines.insert(i + 1, f"{inner_indent_str}pass") |
|
|
|
|
|
end_idx += 2 |
|
break |
|
|
|
elif mod_type == "ineff |
|
# recursive_swe_bench/task_generators/bug_fixing.py (finalized) |
|
|
|
elif mod_type == "inefficient_data_structure": |
|
# Find indentation of the function |
|
for i in range(start_idx + 1, end_idx + 1): |
|
if lines[i].strip(): |
|
indent = len(lines[i]) - len(lines[i].lstrip()) |
|
break |
|
else: |
|
indent = 4 |
|
|
|
# Find a suitable place to add inefficient data structure usage |
|
for i in range(start_idx + 1, end_idx + 1): |
|
if "def " not in lines[i] and lines[i].strip(): |
|
# Add inefficient data structure usage after this line |
|
indent_str = ' ' * indent |
|
|
|
# Add inefficient code |
|
lines.insert(i + 1, f"{indent_str} |
|
lines.insert(i + 2, f"{indent_str}results = []") |
|
lines.insert(i + 3, f"{indent_str}for i in range(1000): # Unnecessarily large range") |
|
lines.insert(i + 4, f"{indent_str} # Using list instead of set for lookups") |
|
lines.insert(i + 5, f"{indent_str} if i % 10 in results: # O(n) lookup instead of O(1)") |
|
lines.insert(i + 6, f"{indent_str} results.append(i) # Unnecessary storage") |
|
|
|
|
|
end_idx += 6 |
|
break |
|
|
|
elif mod_type == "redundant_computation": |
|
|
|
for i in range(start_idx + 1, end_idx + 1): |
|
if lines[i].strip(): |
|
indent = len(lines[i]) - len(lines[i].lstrip()) |
|
break |
|
else: |
|
indent = 4 |
|
|
|
|
|
for i in range(start_idx + 1, end_idx + 1): |
|
if "for " in lines[i] or "while " in lines[i]: |
|
|
|
inner_indent = len(lines[i]) - len(lines[i].lstrip()) + 4 |
|
inner_indent_str = ' ' * inner_indent |
|
|
|
|
|
lines.insert(i + 1, f"{inner_indent_str}# Redundant computation in each iteration") |
|
lines.insert(i + 2, f"{inner_indent_str}temp_sum = 0") |
|
lines.insert(i + 3, f"{inner_indent_str}for j in range(100): # Unnecessary nested computation") |
|
lines.insert(i + 4, f"{inner_indent_str} temp_sum += j") |
|
|
|
|
|
end_idx += 4 |
|
break |
|
|
|
|
|
problem_state.code_context["code"] = '\n'.join(lines) |
|
|
|
|
|
if "bugs" not in problem_state.code_context: |
|
problem_state.code_context["bugs"] = [] |
|
|
|
problem_state.code_context["bugs"].append({ |
|
"type": BugCategory.PERFORMANCE, |
|
"line": start_idx + 1, |
|
"description": f"Performance issue introduced in function '{func_name}'" |
|
}) |
|
|
|
def _insert_edge_case_bug(self, problem_state: ProblemState) -> None: |
|
""" |
|
Insert an edge case bug into the problem state. |
|
|
|
Args: |
|
problem_state: The problem state to modify |
|
""" |
|
code = problem_state.code_context["code"] |
|
lines = code.split('\n') |
|
if not lines: |
|
return |
|
|
|
|
|
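# Locate function boundaries so the edge-case bug can be injected inside a single function. |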
functions = [] |
|
current_func = None |
|
func_start = None |
|
for i, line in enumerate(lines): |
|
if line.strip().startswith("def "): |
|
if current_func: |
|
functions.append((func_start, i - 1, current_func)) |
|
current_func = line.strip()[4:].split("(")[0] |
|
func_start = i |
|
elif i == len(lines) - 1 and current_func: |
|
functions.append((func_start, i, current_func)) |
|
|
|
if not functions: |
|
return |
|
|
|
|
|
start_idx, end_idx, func_name = random.choice(functions) |
|
|
|
|
|
mod_type = random.choice([ |
|
"remove_boundary_check", |
|
"missing_edge_case", |
|
"type_assumption" |
|
]) |
|
|
|
if mod_type == "remove_boundary_check": |
|
|
|
boundary_checks = [] |
|
for i in range(start_idx + 1, end_idx + 1): |
|
if (re.search(r'if\s+.*(len|empty|<=|>=|<|>|==|!=)', lines[i]) and |
|
(("if not " in lines[i]) or ("if len(" in lines[i]) or |
|
("if " in lines[i] and " == 0" in lines[i]) or |
|
("if " in lines[i] and " == []" in lines[i]) or |
|
("if " in lines[i] and " == ''" in lines[i]) or |
|
("if " in lines[i] and " is None" in lines[i]))): |
|
boundary_checks.append(i) |
|
|
|
if boundary_checks: |
|
|
|
idx = random.choice(boundary_checks) |
|
|
|
|
|
lines[idx] = f"# {lines[idx]} # Boundary check removed" |
|
|
|
|
|
i = idx + 1 |
|
while i <= end_idx and (not lines[i].strip() or len(lines[i]) - len(lines[i].lstrip()) > len(lines[idx]) - len(lines[idx].lstrip())): |
|
lines[i] = f"# {lines[i]}" |
|
i += 1 |
|
else: |
|
|
|
|
|
for i in range(start_idx + 1, end_idx + 1): |
|
if lines[i].strip() and not (lines[i].strip().startswith('"""') or lines[i].strip().startswith("'''")): |
|
indent = len(lines[i]) - len(lines[i].lstrip()) |
|
indent_str = ' ' * indent |
|
|
|
|
|
lines.insert(i, f"{indent_str}# Missing check for empty input") |
|
lines.insert(i + 1, f"{indent_str}first_item = items[0] # Will fail on empty input") |
|
|
|
|
|
end_idx += 2 |
|
break |
|
|
|
elif mod_type == "missing_edge_case": |
|
|
|
for i in range(start_idx + 1, end_idx + 1): |
|
if ("/" in lines[i] or |
|
"if " in lines[i] and "==" in lines[i] or |
|
"if " in lines[i] and "!=" in lines[i]): |
|
|
|
if "/" in lines[i] and not re.search(r'if\s+.*!=\s*0', lines[i-1]): |
|
|
|
indent = len(lines[i]) - len(lines[i].lstrip()) |
|
indent_str = ' ' * indent |
|
|
|
|
|
match = re.search(r'/\s*(\w+)', lines[i]) |
|
if match: |
|
denominator = match.group(1) |
|
|
|
|
|
j = i - 1 |
|
while j >= start_idx and len(lines[j]) - len(lines[j].lstrip()) >= indent: |
|
if f"if {denominator}" in lines[j] and "== 0" in lines[j]: |
|
lines[j] = f"# {lines[j]} # Zero division check removed" |
|
j -= 1 |
|
|
|
|
|
lines.insert(i, f"{indent_str}# Missing check for zero division") |
|
|
|
|
|
end_idx += 1 |
|
break |
|
|
|
elif ("==" in lines[i] or "!=" in lines[i]) and "None" not in lines[i]: |
|
|
|
lines[i] = f"# {lines[i]} # Edge case check removed" |
|
break |
|
else: |
|
|
|
|
|
for i in range(start_idx + 1, end_idx + 1): |
|
if lines[i].strip() and not (lines[i].strip().startswith('"""') or lines[i].strip().startswith("'''")): |
|
indent = len(lines[i]) - len(lines[i].lstrip()) |
|
indent_str = ' ' * indent |
|
|
|
|
|
lines.insert(i, f"{indent_str}# Missing handling for edge cases") |
|
lines.insert(i + 1, f"{indent_str}# This function doesn't handle special cases properly") |
|
|
|
|
|
end_idx += 2 |
|
break |
|
|
|
elif mod_type == "type_assumption": |
|
|
|
for i in range(start_idx + 1, end_idx + 1): |
|
if re.search(r'for\s+\w+\s+in\s+\w+', lines[i]) or "=" in lines[i] and "[" in lines[i]: |
|
|
|
var_match = re.search(r'for\s+\w+\s+in\s+(\w+)', lines[i]) |
|
if not var_match: |
|
var_match = re.search(r'(\w+)\s*=', lines[i]) |
|
|
|
if var_match: |
|
var_name = var_match.group(1) |
|
indent = len(lines[i]) - len(lines[i].lstrip()) |
|
indent_str = ' ' * indent |
|
|
|
|
|
lines.insert(i + 1, f"{indent_str}# Type assumption: {var_name} is assumed to be a list") |
|
lines.insert(i + 2, f"{indent_str}if len({var_name}) > 0: # Will fail if {var_name} doesn't support len()") |
|
lines.insert(i + 3, f"{indent_str} first = {var_name}[0] # Will fail if {var_name} is not subscriptable") |
|
|
|
|
|
end_idx += 3 |
|
break |
|
else: |
|
|
|
for i in range(start_idx + 1, end_idx + 1): |
|
if lines[i].strip() and not (lines[i].strip().startswith('"""') or lines[i].strip().startswith("'''")): |
|
indent = len(lines[i]) - len(lines[i].lstrip()) |
|
indent_str = ' ' * indent |
|
|
|
|
|
param_match = re.search(r'def\s+\w+\s*\(\s*(\w+)', lines[start_idx]) |
|
param_name = param_match.group(1) if param_match else "input_data" |
|
|
|
|
|
lines.insert(i, f"{indent_str}# Type assumption: {param_name} is assumed to be a specific type") |
|
lines.insert(i + 1, f"{indent_str}{param_name}_str = str({param_name}) # Will fail if {param_name} can't be converted to string") |
|
|
|
|
|
end_idx += 2 |
|
break |
|
|
|
|
|
problem_state.code_context["code"] = '\n'.join(lines) |
|
|
|
|
|
if "bugs" not in problem_state.code_context: |
|
problem_state.code_context["bugs"] = [] |
|
|
|
problem_state.code_context["bugs"].append({ |
|
"type": BugCategory.EDGE_CASE, |
|
"line": start_idx + 1, |
|
"description": f"Edge case bug introduced in function '{func_name}'" |
|
}) |
|
|
|
def _generate_description(self, problem_state: ProblemState) -> str: |
|
""" |
|
Generate a description for the current problem state. |
|
|
|
Args: |
|
problem_state: The problem state |
|
|
|
Returns: |
|
A descriptive prompt for the problem |
|
""" |
|
|
|
bug_count = problem_state.code_context.get("bug_count", 0) |
|
plural = "bugs" if bug_count != 1 else "bug" |
|
|
|
base_desc = ( |
|
f"Fix the {plural} in the code below. " |
|
f"There {'are' if bug_count != 1 else 'is'} {bug_count} {plural} to find and fix." |
|
) |
|
|
|
|
|
if "bug_categories" in problem_state.code_context: |
|
categories = problem_state.code_context["bug_categories"] |
|
if categories: |
|
category_desc = ", ".join(categories) |
|
base_desc += f"\n\nThe code contains the following types of issues: {category_desc}." |
|
|
|
|
|
if problem_state.requirements: |
|
base_desc += "\n\nRequirements:" |
|
for i, req in enumerate(problem_state.requirements): |
|
base_desc += f"\n{i+1}. {req['description']}" |
|
|
|
|
|
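# Translate the numeric difficulty back into a human-readable label. |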
difficulty_desc = "easy" |
|
if problem_state.difficulty > 0.3 and problem_state.difficulty <= 0.6: |
|
difficulty_desc = "moderate" |
|
elif problem_state.difficulty > 0.6 and problem_state.difficulty <= 0.8: |
|
difficulty_desc = "challenging" |
|
elif problem_state.difficulty > 0.8: |
|
difficulty_desc = "very challenging" |
|
|
|
base_desc += f"\n\nThis is a {difficulty_desc} bug fixing task." |
|
|
|
return base_desc |
|
|
|
|
|
|
|
class DefaultTestRunner: |
|
""" |
|
Default test runner for evaluating solutions. |
|
|
|
This class runs tests against a solution file and collects the results. |
|
""" |
|
|
|
def run_tests( |
|
self, |
|
solution_file: Path, |
|
test_files: List[Path], |
|
code_context: Dict[str, Any] |
|
) -> Dict[str, Any]: |
|
""" |
|
Run tests against a solution file. |
|
|
|
Args: |
|
solution_file: Path to the solution file |
|
test_files: List of test file paths |
|
code_context: Additional context about the code |
|
|
|
Returns: |
|
Dictionary containing test results |
|
""" |
|
|
|
# Local imports used by this runner (not part of the module-level imports). |
import sys |
import time |
import io |
import unittest |
import importlib.util |
from contextlib import redirect_stdout, redirect_stderr |
| 
results = { |
|
"all_passed": True, |
|
"passed_tests": 0, |
|
"total_tests": 0, |
|
"tests": {}, |
|
"execution": { |
|
"success": True, |
|
"error": None, |
|
"stdout": "", |
|
"stderr": "" |
|
}, |
|
"execution_time": 0.0 |
|
} |
|
|
|
|
|
if not solution_file.exists(): |
|
results["execution"]["success"] = False |
|
results["execution"]["error"] = f"Solution file not found: {solution_file}" |
|
results["all_passed"] = False |
|
return results |
|
|
|
|
|
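# Import the solution module and time how long loading it takes. |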
try: |
|
start_time = time.time() |
|
|
|
|
|
sys.path.insert(0, str(solution_file.parent)) |
|
|
|
|
|
spec = importlib.util.spec_from_file_location( |
|
"solution", solution_file) |
|
solution_module = importlib.util.module_from_spec(spec) |
|
spec.loader.exec_module(solution_module) |
|
|
|
|
|
sys.path.pop(0) |
|
|
|
|
|
end_time = time.time() |
|
results["execution_time"] = end_time - start_time |
|
|
|
except Exception as e: |
|
results["execution"]["success"] = False |
|
results["execution"]["error"] = str(e) |
|
results["all_passed"] = False |
|
return results |
|
|
|
|
|
for test_file in test_files: |
|
|
|
if not test_file.exists(): |
|
continue |
|
|
|
try: |
|
|
|
loader = unittest.TestLoader() |
|
|
|
|
|
sys.path.insert(0, str(test_file.parent)) |
|
|
|
|
|
stdout_buffer = io.StringIO() |
|
stderr_buffer = io.StringIO() |
|
|
|
|
|
test_suite = loader.discover( |
|
str(test_file.parent), |
|
pattern=test_file.name |
|
) |
|
|
|
|
|
test_count = 0 |
|
for suite in test_suite: |
|
for test_case in suite: |
|
test_count += test_case.countTestCases() |
|
|
|
results["total_tests"] += test_count |
|
|
|
|
|
with redirect_stdout(stdout_buffer), redirect_stderr(stderr_buffer): |
|
test_runner = unittest.TextTestRunner(verbosity=2) |
|
test_result = test_runner.run(test_suite) |
|
|
|
|
|
stdout = stdout_buffer.getvalue() |
|
stderr = stderr_buffer.getvalue() |
|
|
|
|
|
sys.path.pop(0) |
|
|
|
|
|
if not test_result.wasSuccessful(): |
|
results["all_passed"] = False |
|
|
|
|
|
passed_tests = test_count - len(test_result.failures) - len(test_result.errors) |
|
results["passed_tests"] += passed_tests |
|
|
|
|
|
test_name = test_file.stem |
|
results["tests"][test_name] = { |
|
"passed": test_result.wasSuccessful(), |
|
"failures": len(test_result.failures), |
|
"errors": len(test_result.errors), |
|
"skipped": len(test_result.skipped), |
|
"total": test_count, |
|
"passed_count": passed_tests, |
|
"stdout": stdout, |
|
"stderr": stderr |
|
} |
|
|
|
|
|
for failure in test_result.failures + test_result.errors: |
|
test_id = failure[0].id().split('.')[-1] |
|
failure_message = failure[1] |
|
|
|
|
|
expected_match = re.search(r'Expected\s*:(.+)', failure_message) |
|
actual_match = re.search(r'Actual\s*:(.+)', failure_message) |
|
|
|
expected = expected_match.group(1).strip() if expected_match else None |
|
actual = actual_match.group(1).strip() if actual_match else None |
|
|
|
if test_id not in results["tests"]: |
|
results["tests"][test_id] = {} |
|
|
|
results["tests"][test_id].update({ |
|
"passed": False, |
|
"message": failure_message, |
|
"expected": expected, |
|
"actual": actual |
|
}) |
|
|
|
except Exception as e: |
|
|
|
results["all_passed"] = False |
|
test_name = test_file.stem |
|
results["tests"][test_name] = { |
|
"passed": False, |
|
"error": str(e), |
|
"failures": 0, |
|
"errors": 1, |
|
"skipped": 0, |
|
"total": 1, |
|
"passed_count": 0 |
|
} |
|
results["total_tests"] += 1 |
|
|
|
return results |
|
|