Update core/generation_engine.py
core/generation_engine.py  +113 -34
CHANGED
@@ -1,46 +1,125 @@
-# algoforge_prime/core/
 from core.llm_clients import call_huggingface_api, call_gemini_api, LLMResponse # Changed to absolute
 from prompts.system_prompts import get_system_prompt # Changed to absolute
-from prompts.prompt_templates import

         llm_response_obj = None # type: LLMResponse
         if llm_client_config["type"] == "hf":
-            llm_response_obj = call_huggingface_api(
                 temperature=llm_client_config["temp"], max_new_tokens=llm_client_config["max_tokens"],
-                system_prompt_text=
             )
         elif llm_client_config["type"] == "google_gemini":
-            llm_response_obj = call_gemini_api(
                 temperature=llm_client_config["temp"], max_new_tokens=llm_client_config["max_tokens"],
-                system_prompt_text=
             )
-        else:
-            solutions_or_errors.append(f"ERROR (Genesis Attempt {i+1}): Unknown LLM client type '{llm_client_config['type']}'")
-            continue

-        if llm_response_obj
         else:
+# algoforge_prime/core/evaluation_engine.py
+import random
+# (Keep your placeholder _placeholder_safe_python_execution as is)
+
 from core.llm_clients import call_huggingface_api, call_gemini_api, LLMResponse # Changed to absolute
 from prompts.system_prompts import get_system_prompt # Changed to absolute
+from prompts.prompt_templates import format_critique_user_prompt # Changed to absolute
+
+class EvaluationResult: # Keep this class definition
+    def __init__(self, score=0, critique_text="", passed_tests=0, total_tests=0, execution_summary=None, raw_llm_critique_response=None):
+        self.score = score
+        self.critique_text = critique_text
+        self.passed_tests = passed_tests
+        self.total_tests = total_tests
+        self.execution_summary = execution_summary
+        self.raw_llm_critique_response = raw_llm_critique_response
+
+    def __str__(self):
+        return f"Score: {self.score}/10. Tests: {self.passed_tests}/{self.total_tests}. Summary: {self.execution_summary}. Critique: {self.critique_text[:100]}..."
+
+def _parse_score_from_llm_text(llm_text_output: str) -> int: # Keep this helper
+    # ... (implementation as before) ...
+    score = 0
+    if not llm_text_output or not isinstance(llm_text_output, str): return score
+    try:
+        import re
+        match = re.search(r"Score:\s*(\d+)(?:\s*/\s*10)?", llm_text_output, re.IGNORECASE)
+        if match:
+            parsed_score_val = int(match.group(1))
+            score = max(1, min(parsed_score_val, 10))
+        else:
+            score = random.randint(3, 6)
+    except Exception: score = random.randint(3, 5)
+    return score
+
+
+def _placeholder_safe_python_execution(code_string: str, user_tests_string: str) -> tuple[int, int, str]: # Keep this placeholder
+    # ... (implementation as before) ...
+    print(f"DEV_INFO: evaluation_engine.py - Entering PLACEHOLDER for code execution.")
+    if not user_tests_string.strip() or not code_string.strip(): return 0, 0, "SIMULATED: No tests/code."
+    test_lines = [line.strip() for line in user_tests_string.splitlines() if line.strip().startswith("assert")]
+    total_tests_found = len(test_lines)
+    if total_tests_found == 0: return 0, 0, "SIMULATED: No 'assert' statements."
+    passed_count = random.randint(total_tests_found // 2, total_tests_found) # Simulate some passing
+    summary = f"Simulated: {passed_count}/{total_tests_found} tests passed."
+    if passed_count < total_tests_found: summary += " Some tests likely failed."
+    return passed_count, total_tests_found, summary
+
+
+def evaluate_solution_candidate(
+    solution_text: str,
+    problem_description: str,
+    problem_type: str,
+    user_provided_tests: str,
+    llm_client_config: dict
+) -> EvaluationResult:
+    llm_critique_output_text = "LLM critique could not be performed."
+    llm_based_score = 0
+    raw_llm_critique_resp = None
+
+    if solution_text and not solution_text.startswith("ERROR"):
+        system_p_critique = get_system_prompt("critique_general")
+        user_p_critique = format_critique_user_prompt(problem_description, solution_text)

         llm_response_obj = None # type: LLMResponse
         if llm_client_config["type"] == "hf":
+            llm_response_obj = call_huggingface_api(
+                user_p_critique, llm_client_config["model_id"],
                 temperature=llm_client_config["temp"], max_new_tokens=llm_client_config["max_tokens"],
+                system_prompt_text=system_p_critique
             )
         elif llm_client_config["type"] == "google_gemini":
+            llm_response_obj = call_gemini_api(
+                user_p_critique, llm_client_config["model_id"],
                 temperature=llm_client_config["temp"], max_new_tokens=llm_client_config["max_tokens"],
+                system_prompt_text=system_p_critique
             )

+        if llm_response_obj:
+            raw_llm_critique_resp = llm_response_obj.raw_response
+            if llm_response_obj.success:
+                llm_critique_output_text = llm_response_obj.text
+                llm_based_score = _parse_score_from_llm_text(llm_critique_output_text)
+            else:
+                llm_critique_output_text = f"Error during LLM critique (Model: {llm_response_obj.model_id_used}): {llm_response_obj.error}"
+                llm_based_score = 0
+    elif solution_text and solution_text.startswith("ERROR"):
+        llm_critique_output_text = f"Solution was an error from Genesis: {solution_text}"
+        llm_based_score = 0
+
+    passed_tests_count = 0
+    total_tests_count = 0
+    exec_summary_msg = "Automated tests not applicable or not run."
+
+    if "python" in problem_type.lower() and user_provided_tests.strip() and solution_text and not solution_text.startswith("ERROR"):
+        passed_tests_count, total_tests_count, exec_summary_msg = _placeholder_safe_python_execution(
+            solution_text, user_provided_tests
+        )
+    elif "python" in problem_type.lower() and not user_provided_tests.strip():
+        exec_summary_msg = "No user tests provided for this Python problem."
+
+    final_score_calculated = llm_based_score
+    if total_tests_count > 0:
+        test_pass_ratio = passed_tests_count / total_tests_count
+        if test_pass_ratio < 0.5:
+            final_score_calculated = max(1, int(llm_based_score * 0.5) - 1)
+        elif test_pass_ratio == 1.0 and passed_tests_count > 0:
+            final_score_calculated = min(10, llm_based_score + 1 if llm_based_score < 10 else 10)
         else:
+            final_score_calculated = int(llm_based_score * (0.6 + 0.4 * test_pass_ratio))
+        final_score_calculated = max(1, min(10, final_score_calculated))
+
+    comprehensive_critique = f"{llm_critique_output_text}"
+    if total_tests_count > 0 or ("python" in problem_type.lower() and user_provided_tests.strip()):
+        comprehensive_critique += f"\n\n**Automated Test Summary (Simulated):**\n{exec_summary_msg}\n"
+        comprehensive_critique += f"Passed: {passed_tests_count}/{total_tests_count}"
+
+    return EvaluationResult(
+        score=final_score_calculated,
+        critique_text=comprehensive_critique,
+        passed_tests=passed_tests_count,
+        total_tests=total_tests_count,
+        execution_summary=exec_summary_msg,
+        raw_llm_critique_response=raw_llm_critique_resp
+    )
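Below is a minimal, hypothetical usage sketch of the evaluate_solution_candidate entry point added in this diff. It assumes the module ends up importable as core.evaluation_engine (matching the file header comment, even though the commit itself edits core/generation_engine.py); the model id, problem text, and test strings are placeholder values, and the config keys simply mirror the ones the function reads (type, model_id, temp, max_tokens).

# Hypothetical usage sketch, not part of the commit.
from core.evaluation_engine import evaluate_solution_candidate  # assumed import path

# A candidate solution, e.g. one produced by the genesis/generation step.
candidate_code = "def add(a, b):\n    return a + b"
user_tests = "assert add(1, 2) == 3\nassert add(-1, 1) == 0"

# Keys match what evaluate_solution_candidate reads; values are illustrative only.
config = {
    "type": "hf",                              # or "google_gemini"
    "model_id": "example-org/example-model",   # placeholder model id
    "temp": 0.3,
    "max_tokens": 512,
}

result = evaluate_solution_candidate(
    solution_text=candidate_code,
    problem_description="Write a function add(a, b) that returns the sum of two numbers.",
    problem_type="python",
    user_provided_tests=user_tests,
    llm_client_config=config,
)
print(result)  # EvaluationResult.__str__: score, test counts, execution summary, critique preview

For a sense of the score blending above: with an LLM-derived score of 7, the final score drops to 2 if fewer than half of the (simulated) tests pass, rises to 8 on a full pass, and otherwise scales as int(7 * (0.6 + 0.4 * ratio)), e.g. 6 at a 0.75 pass ratio.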