Update core/generation_engine.py
core/generation_engine.py  +113 -34
CHANGED
@@ -1,46 +1,125 @@
-# algoforge_prime/core/
 from core.llm_clients import call_huggingface_api, call_gemini_api, LLMResponse # Changed to absolute
 from prompts.system_prompts import get_system_prompt # Changed to absolute
-from prompts.prompt_templates import

         llm_response_obj = None # type: LLMResponse
         if llm_client_config["type"] == "hf":
-            llm_response_obj = call_huggingface_api(
                 temperature=llm_client_config["temp"], max_new_tokens=llm_client_config["max_tokens"],
-                system_prompt_text=
             )
         elif llm_client_config["type"] == "google_gemini":
-            llm_response_obj = call_gemini_api(
                 temperature=llm_client_config["temp"], max_new_tokens=llm_client_config["max_tokens"],
-                system_prompt_text=
             )
-        else:
-            solutions_or_errors.append(f"ERROR (Genesis Attempt {i+1}): Unknown LLM client type '{llm_client_config['type']}'")
-            continue

-        if llm_response_obj
         else:
+# algoforge_prime/core/evaluation_engine.py
+import random
+# (Keep your placeholder _placeholder_safe_python_execution as is)
+
 from core.llm_clients import call_huggingface_api, call_gemini_api, LLMResponse # Changed to absolute
 from prompts.system_prompts import get_system_prompt # Changed to absolute
+from prompts.prompt_templates import format_critique_user_prompt # Changed to absolute
+
+class EvaluationResult: # Keep this class definition
+    def __init__(self, score=0, critique_text="", passed_tests=0, total_tests=0, execution_summary=None, raw_llm_critique_response=None):
+        self.score = score
+        self.critique_text = critique_text
+        self.passed_tests = passed_tests
+        self.total_tests = total_tests
+        self.execution_summary = execution_summary
+        self.raw_llm_critique_response = raw_llm_critique_response
+
+    def __str__(self):
+        return f"Score: {self.score}/10. Tests: {self.passed_tests}/{self.total_tests}. Summary: {self.execution_summary}. Critique: {self.critique_text[:100]}..."
+
+def _parse_score_from_llm_text(llm_text_output: str) -> int: # Keep this helper
+    # ... (implementation as before) ...
+    score = 0
+    if not llm_text_output or not isinstance(llm_text_output, str): return score
+    try:
+        import re
+        match = re.search(r"Score:\s*(\d+)(?:\s*/\s*10)?", llm_text_output, re.IGNORECASE)
+        if match:
+            parsed_score_val = int(match.group(1))
+            score = max(1, min(parsed_score_val, 10))
+        else:
+            score = random.randint(3, 6)
+    except Exception: score = random.randint(3, 5)
+    return score
+
+
+def _placeholder_safe_python_execution(code_string: str, user_tests_string: str) -> tuple[int, int, str]: # Keep this placeholder
+    # ... (implementation as before) ...
+    print(f"DEV_INFO: evaluation_engine.py - Entering PLACEHOLDER for code execution.")
+    if not user_tests_string.strip() or not code_string.strip(): return 0, 0, "SIMULATED: No tests/code."
+    test_lines = [line.strip() for line in user_tests_string.splitlines() if line.strip().startswith("assert")]
+    total_tests_found = len(test_lines)
+    if total_tests_found == 0: return 0, 0, "SIMULATED: No 'assert' statements."
+    passed_count = random.randint(total_tests_found // 2, total_tests_found) # Simulate some passing
+    summary = f"Simulated: {passed_count}/{total_tests_found} tests passed."
+    if passed_count < total_tests_found: summary += " Some tests likely failed."
+    return passed_count, total_tests_found, summary
+
+
+def evaluate_solution_candidate(
+    solution_text: str,
+    problem_description: str,
+    problem_type: str,
+    user_provided_tests: str,
+    llm_client_config: dict
+) -> EvaluationResult:
+    llm_critique_output_text = "LLM critique could not be performed."
+    llm_based_score = 0
+    raw_llm_critique_resp = None
+
+    if solution_text and not solution_text.startswith("ERROR"):
+        system_p_critique = get_system_prompt("critique_general")
+        user_p_critique = format_critique_user_prompt(problem_description, solution_text)

         llm_response_obj = None # type: LLMResponse
         if llm_client_config["type"] == "hf":
+            llm_response_obj = call_huggingface_api(
+                user_p_critique, llm_client_config["model_id"],
                 temperature=llm_client_config["temp"], max_new_tokens=llm_client_config["max_tokens"],
+                system_prompt_text=system_p_critique
             )
         elif llm_client_config["type"] == "google_gemini":
+            llm_response_obj = call_gemini_api(
+                user_p_critique, llm_client_config["model_id"],
                 temperature=llm_client_config["temp"], max_new_tokens=llm_client_config["max_tokens"],
+                system_prompt_text=system_p_critique
             )

+        if llm_response_obj:
+            raw_llm_critique_resp = llm_response_obj.raw_response
+            if llm_response_obj.success:
+                llm_critique_output_text = llm_response_obj.text
+                llm_based_score = _parse_score_from_llm_text(llm_critique_output_text)
+            else:
+                llm_critique_output_text = f"Error during LLM critique (Model: {llm_response_obj.model_id_used}): {llm_response_obj.error}"
+                llm_based_score = 0
+    elif solution_text and solution_text.startswith("ERROR"):
+        llm_critique_output_text = f"Solution was an error from Genesis: {solution_text}"
+        llm_based_score = 0
+
+    passed_tests_count = 0
+    total_tests_count = 0
+    exec_summary_msg = "Automated tests not applicable or not run."
+
+    if "python" in problem_type.lower() and user_provided_tests.strip() and solution_text and not solution_text.startswith("ERROR"):
+        passed_tests_count, total_tests_count, exec_summary_msg = _placeholder_safe_python_execution(
+            solution_text, user_provided_tests
+        )
+    elif "python" in problem_type.lower() and not user_provided_tests.strip():
+        exec_summary_msg = "No user tests provided for this Python problem."
+
+    final_score_calculated = llm_based_score
+    if total_tests_count > 0:
+        test_pass_ratio = passed_tests_count / total_tests_count
+        if test_pass_ratio < 0.5:
+            final_score_calculated = max(1, int(llm_based_score * 0.5) - 1)
+        elif test_pass_ratio == 1.0 and passed_tests_count > 0:
+            final_score_calculated = min(10, llm_based_score + 1 if llm_based_score < 10 else 10)
         else:
+            final_score_calculated = int(llm_based_score * (0.6 + 0.4 * test_pass_ratio))
+        final_score_calculated = max(1, min(10, final_score_calculated))
+
+    comprehensive_critique = f"{llm_critique_output_text}"
+    if total_tests_count > 0 or ("python" in problem_type.lower() and user_provided_tests.strip()):
+        comprehensive_critique += f"\n\n**Automated Test Summary (Simulated):**\n{exec_summary_msg}\n"
+        comprehensive_critique += f"Passed: {passed_tests_count}/{total_tests_count}"
+
+    return EvaluationResult(
+        score=final_score_calculated,
+        critique_text=comprehensive_critique,
+        passed_tests=passed_tests_count,
+        total_tests=total_tests_count,
+        execution_summary=exec_summary_msg,
+        raw_llm_critique_response=raw_llm_critique_resp
+    )
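Below is a minimal, hypothetical usage sketch of the evaluate_solution_candidate entry point added in this diff. It assumes the module ends up importable as core.evaluation_engine (matching the file header comment, even though the commit itself edits core/generation_engine.py); the model id, problem text, and test strings are placeholder values, and the config keys simply mirror the ones the function reads (type, model_id, temp, max_tokens).

# Hypothetical usage sketch, not part of the commit.
from core.evaluation_engine import evaluate_solution_candidate  # assumed import path

# A candidate solution, e.g. one produced by the genesis/generation step.
candidate_code = "def add(a, b):\n    return a + b"
user_tests = "assert add(1, 2) == 3\nassert add(-1, 1) == 0"

# Keys match what evaluate_solution_candidate reads; values are illustrative only.
config = {
    "type": "hf",                              # or "google_gemini"
    "model_id": "example-org/example-model",   # placeholder model id
    "temp": 0.3,
    "max_tokens": 512,
}

result = evaluate_solution_candidate(
    solution_text=candidate_code,
    problem_description="Write a function add(a, b) that returns the sum of two numbers.",
    problem_type="python",
    user_provided_tests=user_tests,
    llm_client_config=config,
)
print(result)  # EvaluationResult.__str__: score, test counts, execution summary, critique preview

For a sense of the score blending above: with an LLM-derived score of 7, the final score drops to 2 if fewer than half of the (simulated) tests pass, rises to 8 on a full pass, and otherwise scales as int(7 * (0.6 + 0.4 * ratio)), e.g. 6 at a 0.75 pass ratio.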