kostis-init committed on
Commit
180f9fe
·
1 Parent(s): 3267617

add extra hf dataset for persistent storage of submissions and results

Files changed (7)
  1. app.py +3 -241
  2. eval.py +0 -356
  3. src/config.py +11 -0
  4. src/eval.py +403 -0
  5. src/hf_utils.py +128 -0
  6. src/ui.py +83 -0
  7. src/utils.py +7 -0
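
The commit message describes storing submissions and evaluation results in an extra Hugging Face dataset repository so they persist outside the app's local filesystem. A minimal sketch of that round trip, assuming the kostis-init/my-storage dataset repo configured in src/config.py and a hypothetical local directory ./my_submission (sketch only, not the app's exact code):

# Sketch: upload a submission folder to the dataset repo, then read back the
# summary that src/eval.py later writes under results/<name>/summary.txt.
from huggingface_hub import HfApi, hf_hub_download

api = HfApi()  # authenticates via the HF_TOKEN environment variable

api.upload_folder(
    folder_path="./my_submission",               # hypothetical local directory
    path_in_repo="submissions/my_submission",    # DS_SUBMISSIONS_PATH/<name>
    repo_id="kostis-init/my-storage",            # DATASET_REPO_ID
    repo_type="dataset",
    commit_message="Upload submission: my_submission",
)

summary_path = hf_hub_download(
    repo_id="kostis-init/my-storage",
    filename="results/my_submission/summary.txt",  # DS_RESULTS_PATH/<name>/summary.txt
    repo_type="dataset",
)
print(open(summary_path, encoding="utf-8").read())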
app.py CHANGED
@@ -1,246 +1,8 @@
1
- import gradio as gr
2
- import pandas as pd
3
- import os
4
- import shutil
5
- from pathlib import Path
6
- import subprocess # For running eval.py
7
- import time
8
- import threading # For background tasks
9
- import sys
10
 
11
- # --- Configuration ---
12
- SUBMISSIONS_DIR = "submissions"
13
- RESULTS_DIR = "results"
14
- EVAL_SCRIPT_PATH = "eval.py"
15
-
16
-
17
- # --- Helper Functions ---
18
-
19
- def setup_directories():
20
- """Creates the submissions and results directories if they don't exist."""
21
- os.makedirs(SUBMISSIONS_DIR, exist_ok=True)
22
- os.makedirs(RESULTS_DIR, exist_ok=True)
23
- if not os.listdir(RESULTS_DIR): # Add a placeholder if results is empty
24
- initial_result_demo_path = Path(RESULTS_DIR) / "initial_example_result"
25
- if not initial_result_demo_path.exists():
26
- os.makedirs(initial_result_demo_path, exist_ok=True)
27
- with open(initial_result_demo_path / "summary.txt", "w") as f:
28
- f.write("This is a placeholder initial result.\nScore: 0\n")
29
- print(f"Created a sample directory in '{RESULTS_DIR}' for demonstration.")
30
-
31
-
32
- def load_leaderboard_data():
33
- """
34
- Scans the RESULTS_DIR for subdirectories and returns a DataFrame.
35
- Each subdirectory name is an entry. Tries to parse a 'Score' from 'summary.txt'.
36
- """
37
- if not os.path.exists(RESULTS_DIR):
38
- return pd.DataFrame(columns=["Result Directory", "Score", "Files"])
39
-
40
- result_dirs = [d for d in os.listdir(RESULTS_DIR) if os.path.isdir(Path(RESULTS_DIR) / d)]
41
-
42
- leaderboard_entries = []
43
- # Sort by modification time of the directory (newest first)
44
- # This requires getting mtime for each directory.
45
- sorted_result_dirs = sorted(
46
- result_dirs,
47
- key=lambda d: (Path(RESULTS_DIR) / d).stat().st_mtime,
48
- reverse=True
49
- )
50
-
51
- for dir_name in sorted_result_dirs:
52
- entry = {"Result Directory": dir_name, "Score": "N/A", "Files": 0}
53
- result_dir_path = Path(RESULTS_DIR) / dir_name
54
-
55
- try:
56
- entry["Files"] = len([f for f in os.listdir(result_dir_path) if os.path.isfile(result_dir_path / f)])
57
- except Exception:
58
- pass # Directory might have been removed during scan
59
-
60
- summary_file = result_dir_path / "summary.txt"
61
- if summary_file.exists():
62
- try:
63
- with open(summary_file, "r") as f:
64
- for line in f:
65
- if line.lower().startswith("score:"):
66
- entry["Score"] = line.split(":", 1)[1].strip()
67
- break
68
- except Exception as e:
69
- print(f"Error parsing summary for {dir_name}: {e}")
70
-
71
- leaderboard_entries.append(entry)
72
-
73
- if not leaderboard_entries:
74
- return pd.DataFrame(columns=["Result Directory", "Score", "Files"])
75
-
76
- return pd.DataFrame(leaderboard_entries)
77
-
78
-
79
- def run_evaluation_in_background(submission_dir_path_str: str, results_dir_str: str, submission_name_for_log: str):
80
- """
81
- This function runs eval.py in a subprocess. It's intended to be run in a separate thread.
82
- Outputs from eval.py will go to the console where app.py is running.
83
- """
84
- print(
85
- f"BACKGROUND THREAD: Starting evaluation for '{submission_name_for_log}' using path '{submission_dir_path_str}'...")
86
-
87
- if not Path(EVAL_SCRIPT_PATH).exists():
88
- print(
89
- f"BACKGROUND THREAD: CRITICAL ERROR - Evaluation script '{EVAL_SCRIPT_PATH}' not found. Eval aborted for '{submission_name_for_log}'.")
90
- return
91
-
92
- command = [sys.executable, EVAL_SCRIPT_PATH, submission_dir_path_str, results_dir_str]
93
-
94
- try:
95
- # Using subprocess.run which is simpler for blocking calls within this thread
96
- process = subprocess.run(
97
- command,
98
- capture_output=True,
99
- text=True,
100
- check=False, # Handle non-zero exit codes manually
101
- timeout=300 # 5-minute timeout for the evaluation script
102
- )
103
-
104
- eval_output = process.stdout.strip()
105
- eval_error = process.stderr.strip()
106
-
107
- print(
108
- f"--- BACKGROUND Eval STDOUT ({submission_name_for_log}) ---\n{eval_output if eval_output else '<No stdout>'}")
109
- if eval_error: # Only print stderr if it's not empty
110
- print(f"--- BACKGROUND Eval STDERR ({submission_name_for_log}) ---\n{eval_error}")
111
-
112
- if process.returncode == 0:
113
- print(f"BACKGROUND THREAD: Evaluation successful for '{submission_name_for_log}'.")
114
- else:
115
- print(
116
- f"BACKGROUND THREAD: Evaluation FAILED for '{submission_name_for_log}'. Script exit code: {process.returncode}")
117
-
118
- except subprocess.TimeoutExpired:
119
- print(f"BACKGROUND THREAD: Evaluation for '{submission_name_for_log}' TIMED OUT after 5 minutes.")
120
- except FileNotFoundError: # This means 'python' or EVAL_SCRIPT_PATH could not be found by subprocess
121
- print(
122
- f"BACKGROUND THREAD: FileNotFoundError - Could not execute command. Ensure 'python' is in PATH and '{EVAL_SCRIPT_PATH}' is correct for '{submission_name_for_log}'.")
123
- except Exception as e:
124
- print(
125
- f"BACKGROUND THREAD: An unexpected error occurred during evaluation for '{submission_name_for_log}': {str(e)}")
126
-
127
- print(f"BACKGROUND THREAD: Finished evaluation attempt for '{submission_name_for_log}'.")
128
-
129
-
130
- def handle_upload_and_kickoff_eval(uploaded_files_list, progress=gr.Progress(track_tqdm=True)):
131
- """
132
- Handles directory upload, saves files, and starts eval.py in a background thread.
133
- Yields a status message for the UI. The leaderboard updates separately.
134
- """
135
- yield "Processing upload..." # Initial status
136
-
137
- if not uploaded_files_list:
138
- yield "No directory uploaded. Please select a directory."
139
- return
140
-
141
- try:
142
- # Determine original uploaded directory name
143
- first_temp_file_path = Path(uploaded_files_list[0].name)
144
- original_uploaded_dir_name = first_temp_file_path.parent.name
145
-
146
- submission_dir_path = Path(SUBMISSIONS_DIR) / original_uploaded_dir_name
147
-
148
- # Handle potential name collision
149
- if submission_dir_path.exists():
150
- timestamp = time.strftime("%Y%m%d-%H%M%S")
151
- descriptive_name_for_log_and_status = f"{original_uploaded_dir_name}_{timestamp}"
152
- submission_dir_path = Path(SUBMISSIONS_DIR) / descriptive_name_for_log_and_status
153
- status_update_msg = f"Directory '{original_uploaded_dir_name}' existed. Saving as '{descriptive_name_for_log_and_status}'."
154
- original_uploaded_dir_name = descriptive_name_for_log_and_status # Use new name for logging
155
- else:
156
- descriptive_name_for_log_and_status = original_uploaded_dir_name
157
- status_update_msg = f"Copying files for '{descriptive_name_for_log_and_status}'..."
158
-
159
- os.makedirs(submission_dir_path, exist_ok=True)
160
- progress(0.1, desc=status_update_msg)
161
-
162
- for i, temp_file_obj in enumerate(progress.tqdm(uploaded_files_list, desc="Copying files")):
163
- temp_file_path = Path(temp_file_obj.name)
164
- file_name_in_dir = temp_file_path.name
165
- target_file_path = submission_dir_path / file_name_in_dir
166
- shutil.copy(str(temp_file_path), str(target_file_path))
167
-
168
- upload_completion_msg = f"Upload of '{descriptive_name_for_log_and_status}' complete."
169
- progress(0.8, desc=upload_completion_msg)
170
-
171
- except Exception as e:
172
- yield f"Error during upload: {str(e)}"
173
- return
174
-
175
- # --- Start evaluation in a background thread ---
176
- if not Path(EVAL_SCRIPT_PATH).exists():
177
- yield f"{upload_completion_msg} BUT CRITICAL ERROR: Evaluation script '{EVAL_SCRIPT_PATH}' not found. Evaluation cannot be started."
178
- return
179
-
180
- # Ensure paths passed to thread are absolute strings, good practice for threads.
181
- abs_submission_path = str(submission_dir_path.resolve())
182
- abs_results_path = str(Path(RESULTS_DIR).resolve())
183
-
184
- eval_thread = threading.Thread(
185
- target=run_evaluation_in_background,
186
- args=(abs_submission_path, abs_results_path, descriptive_name_for_log_and_status),
187
- daemon=True # Set as daemon so it exits when main app exits
188
- )
189
- eval_thread.start()
190
-
191
- final_status_msg = (
192
- f"{upload_completion_msg} Evaluation for '{descriptive_name_for_log_and_status}' has started in the background. "
193
- "The leaderboard will auto-refresh (or use manual refresh)."
194
- )
195
- progress(1.0, desc="Background evaluation initiated.")
196
- yield final_status_msg
197
-
198
-
199
- # --- Create Directories ---
200
  setup_directories()
201
 
202
- # --- Gradio App Definition ---
203
- with gr.Blocks(title="CP-Bench Leaderboard") as demo:
204
- gr.Markdown(
205
- """
206
- # CP-Bench Leaderboard
207
-
208
- This is a leaderboard for the CP-Bench dataset. You can upload your submission directory for evaluation.
209
- """
210
- )
211
-
212
- with gr.Row():
213
- with gr.Column(scale=1): # Upload Column
214
- gr.Markdown("## 📤 Upload Submission")
215
- upload_button = gr.UploadButton(
216
- "Click to Upload Directory for Evaluation",
217
- file_count="directory",
218
- )
219
- upload_status_textbox = gr.Textbox(label="Current Status", interactive=False, lines=4)
220
-
221
- with gr.Column(scale=3): # Leaderboard Column
222
- gr.Markdown("## 🏆 Results Leaderboard")
223
- leaderboard_df_component = gr.DataFrame(
224
- value=load_leaderboard_data, # Load initial data
225
- label="Leaderboard (auto-refreshes)",
226
- interactive=False,
227
- # every=20 # Auto-refresh leaderboard data every 20 seconds
228
- )
229
- refresh_leaderboard_button = gr.Button("🔄 Refresh Leaderboard Manually")
230
-
231
- # --- Event Handlers ---
232
- upload_button.upload(
233
- fn=handle_upload_and_kickoff_eval,
234
- inputs=[upload_button],
235
- outputs=[upload_status_textbox], # Only one output now for the status message
236
- show_progress="full"
237
- )
238
-
239
- refresh_leaderboard_button.click(
240
- fn=load_leaderboard_data,
241
- inputs=None,
242
- outputs=[leaderboard_df_component]
243
- )
244
-
245
  if __name__ == "__main__":
 
246
  demo.queue().launch()
 
1
+ from src.ui import create_ui
2
+ from src.utils import setup_directories
 
3
4
  setup_directories()
5
6
  if __name__ == "__main__":
7
+ demo = create_ui()
8
  demo.queue().launch()
eval.py DELETED
@@ -1,356 +0,0 @@
1
- # eval.py
2
- import sys
3
- import os
4
- import time
5
- import json
6
- import subprocess
7
- import tempfile
8
- from pathlib import Path
9
- from datasets import load_dataset # Hugging Face datasets library
10
-
11
- # --- Configuration ---
12
-
13
- DATASET_NAME = "kostis-init/CP-Bench"
14
-
15
- # Column names in the Hugging Face dataset for problem identifier and model script
16
- PROBLEM_NAME_COLUMN = "id"
17
- MODEL_CODE_COLUMN = "model"
18
-
19
- # Timeout for running individual model scripts (both generated and modified ground-truth)
20
- SCRIPT_EXECUTION_TIMEOUT = 60 # seconds
21
-
22
-
23
- def extract_json_from_string(text_output: str):
24
- """
25
- Attempts to find and parse the first valid JSON object or array from a string.
26
- Handles cases where JSON is preceded or followed by non-JSON text.
27
- """
28
- idx = 0
29
- while idx < len(text_output):
30
- # Find the next potential start of a JSON structure
31
- start_brace = text_output.find('{', idx)
32
- start_bracket = text_output.find('[', idx)
33
-
34
- if start_brace == -1 and start_bracket == -1:
35
- # No more '{' or '[' found in the rest of the string
36
- return None
37
-
38
- # Determine the actual starting character for this attempt
39
- if start_brace != -1 and (start_bracket == -1 or start_brace < start_bracket):
40
- json_start_index = start_brace
41
- else:
42
- json_start_index = start_bracket
43
-
44
- potential_json_segment = text_output[json_start_index:]
45
-
46
- try:
47
- # Use raw_decode to parse the first valid JSON object from the segment
48
- decoder = json.JSONDecoder()
49
- json_obj, end_index_in_segment = decoder.raw_decode(potential_json_segment)
50
- # Successfully parsed a JSON object
51
- return json_obj
52
- except json.JSONDecodeError:
53
- # This segment (starting at json_start_index) wasn't a valid JSON.
54
- # Advance the search index past the character that caused the current attempt.
55
- idx = json_start_index + 1
56
-
57
- return None # No valid JSON found in the entire string
58
-
59
-
60
- def run_instance(instance_path_str: str,
61
- timeout: int = SCRIPT_EXECUTION_TIMEOUT): # SCRIPT_EXECUTION_TIMEOUT should be defined
62
- """Run the instance file and robustly capture the JSON output."""
63
- command = [sys.executable, instance_path_str]
64
- instance_name = Path(instance_path_str).name
65
- try:
66
- result = subprocess.run(command, capture_output=True, text=True, timeout=timeout, encoding='utf-8',
67
- errors='replace')
68
-
69
- # Check return code first
70
- if result.returncode != 0:
71
- # Log stderr for debugging if the script itself failed
72
- error_message = result.stderr[:500].strip() if result.stderr else "<No stderr>"
73
- print(f" ERROR: Running {instance_name} (Return Code: {result.returncode}): {error_message}", flush=True)
74
- return None
75
-
76
- # Attempt to extract JSON from stdout
77
- stdout_text = result.stdout
78
- if not stdout_text or not stdout_text.strip():
79
- print(f" ERROR: No stdout from {instance_name}.", flush=True)
80
- return None
81
-
82
- solution = extract_json_from_string(stdout_text)
83
-
84
- if solution is None:
85
- # Be more verbose if JSON extraction fails
86
- abbreviated_stdout = stdout_text.replace('\n', '\\n')[:300] # Show newlines as \n for brevity
87
- print(
88
- f" ERROR: Could not extract valid JSON from {instance_name}. Raw stdout (abbreviated): '{abbreviated_stdout}...'",
89
- flush=True)
90
- return None
91
-
92
- return solution
93
-
94
- except subprocess.TimeoutExpired:
95
- print(f" ERROR: Timeout running {instance_name} (>{timeout}s)", flush=True)
96
- return None
97
- except Exception as e:
98
- print(f" ERROR: Unexpected error running {instance_name}: {e}", flush=True)
99
- return None
100
-
101
-
102
- def add_constraints_as_string(solution):
103
- """Generate constraints as a string to be added to the original script."""
104
- constraints = ""
105
- if solution: # Ensure solution is not None
106
- for key, value in solution.items():
107
- # Basic escaping for string values if they occur, though typically solutions are numeric/boolean
108
- if isinstance(value, str):
109
- constraints += f"\nmodel += ({key} == \"{value}\")"
110
- else:
111
- constraints += f"\nmodel += ({key} == {value})"
112
- return constraints
113
-
114
-
115
- def get_modified_script(script_content, solution):
116
- """Add constraints to the script content and self-consistency checks."""
117
- constraints_str = add_constraints_as_string(solution)
118
- modified_script = f"{script_content}\n{constraints_str}"
119
- modified_script += """
120
-
121
- # --- Self-consistency check appended by eval.py ---
122
- # Print the absolute path of the current directory along with the script name
123
- import os
124
- # print(f"DEBUG: Running modified script: {os.path.abspath(__file__)}") # Optional debug
125
-
126
- # Keep old objective
127
- old_objective_value = None
128
- objective_defined = False
129
- if 'model' in locals() and hasattr(model, 'objective_value') and callable(model.objective_value):
130
- try:
131
- # This block assumes 'model' is the CPMpy model object or similar
132
- # Check if an objective is set. Some libraries might not have a direct 'objective_is_min/max'
133
- # or might raise an error if objective_value() is called on an unsolved/unformulated objective.
134
- # This part might need adjustment based on the specific modeling library used in CP-Bench.
135
- # For now, we'll try to get it and catch errors.
136
- # A more robust way might be to inspect model.objective_
137
- if hasattr(model, '_objective_value'): # cpmpy specific check if objective was set
138
- if model._objective_value is not None: # cpmpy does not have objective_is_min
139
- objective_defined = True
140
- old_objective_value = model.objective_value()
141
-
142
- except Exception as e_obj_check:
143
- # print(f"DEBUG: Could not retrieve initial objective value: {e_obj_check}")
144
- pass # Objective might not be set or model not solved yet.
145
-
146
- # Check self-consistency
147
- solved_ok = False
148
- try:
149
- if 'model' in locals() and hasattr(model, 'solve') and callable(model.solve):
150
- solved_ok = model.solve()
151
- else:
152
- print('ERROR: Model object not found or does not have a solve() method.')
153
- except Exception as e_solve:
154
- print(f'ERROR: Exception during model.solve(): {e_solve}')
155
- solved_ok = False # Ensure it's false on exception
156
-
157
- if not solved_ok:
158
- print('EVAL_OUTPUT: CONSISTENCY_CHECK_RESULT=UNSATISFIABLE')
159
- else:
160
- print('EVAL_OUTPUT: CONSISTENCY_CHECK_RESULT=SUCCESS')
161
-
162
- # Check if the objective value is the same
163
- if not objective_defined:
164
- print('EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=NO_OBJECTIVE_DEFINED')
165
- else:
166
- try:
167
- current_objective_value = model.objective_value()
168
- # Handle potential floating point inaccuracies if objectives can be floats
169
- if isinstance(old_objective_value, float) or isinstance(current_objective_value, float):
170
- if abs(current_objective_value - old_objective_value) < 1e-6: # Tolerance for float comparison
171
- print('EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CONSISTENT')
172
- else:
173
- print(f'EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CHANGED (Old: {old_objective_value}, New: {current_objective_value})')
174
- elif current_objective_value != old_objective_value: # Integer comparison
175
- print(f'EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CHANGED (Old: {old_objective_value}, New: {current_objective_value})')
176
- else:
177
- print('EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CONSISTENT')
178
- except Exception as e_obj_final:
179
- print(f'EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=ERROR_ACCESSING_FINAL_OBJECTIVE ({e_obj_final})')
180
-
181
- """
182
- return modified_script
183
-
184
-
185
- # --- Main Evaluation Logic ---
186
- def main(submission_path_str: str, results_base_dir_str: str):
187
- start_time = time.time()
188
- print(f"eval.py: Starting evaluation for submission at '{submission_path_str}'", flush=True)
189
- print(f"eval.py: Results will be saved relative to '{results_base_dir_str}'", flush=True)
190
- print(f"eval.py: Loading ground-truth dataset '{DATASET_NAME}' from Hugging Face.", flush=True)
191
-
192
- submission_path = Path(submission_path_str)
193
- submission_name = submission_path.name
194
- result_dir_for_submission = Path(results_base_dir_str) / f"{submission_name}_result"
195
- os.makedirs(result_dir_for_submission, exist_ok=True)
196
- summary_file_path = result_dir_for_submission / "summary.txt"
197
-
198
- # Load ground-truth dataset
199
- try:
200
- # Make sure you are authenticated with `huggingface-cli login` if the dataset is private or requires it.
201
- gt_dataset = load_dataset(DATASET_NAME, split="train")
202
- ground_truth_models = {
203
- item[PROBLEM_NAME_COLUMN]: item[MODEL_CODE_COLUMN]
204
- for item in gt_dataset
205
- if PROBLEM_NAME_COLUMN in item and MODEL_CODE_COLUMN in item and item[MODEL_CODE_COLUMN]
206
- }
207
- if not ground_truth_models:
208
- raise ValueError(
209
- f"No models found in dataset. Check PROBLEM_NAME_COLUMN ('{PROBLEM_NAME_COLUMN}') and MODEL_CODE_COLUMN ('{MODEL_CODE_COLUMN}').")
210
- print(f"eval.py: Loaded {len(ground_truth_models)} ground-truth models from Hugging Face.", flush=True)
211
- except Exception as e:
212
- print(f"eval.py: CRITICAL ERROR - Failed to load ground-truth dataset: {e}", flush=True)
213
- with open(summary_file_path, "w") as f:
214
- f.write(f"CRITICAL ERROR: Failed to load ground-truth dataset '{DATASET_NAME}'.\nError: {e}\n")
215
- return 1 # Indicate failure
216
-
217
- # Statistics
218
- total_submitted_models = 0
219
- models_ran_successfully = 0
220
- gt_models_found = 0
221
- consistency_checks_passed = 0
222
- objective_checks_passed = 0 # Includes "NO_OBJECTIVE_DEFINED" as a pass
223
-
224
- with open(summary_file_path, "w") as summary_f:
225
- summary_f.write(f"Evaluation Summary for Submission: {submission_name}\n")
226
- summary_f.write(
227
- f"Ground-Truth Dataset: {DATASET_NAME}\n")
228
- summary_f.write("-" * 30 + "\n")
229
-
230
- submitted_model_files = list(submission_path.glob('*.py')) # Assuming Python models
231
- if not submitted_model_files:
232
- summary_f.write("No .py model files found in submission.\n")
233
- print("eval.py: No .py model files found in submission.", flush=True)
234
- return 0 # No models to evaluate, but script ran.
235
-
236
- for model_file_path in submitted_model_files:
237
- total_submitted_models += 1
238
- problem_name = model_file_path.stem # Filename without .py extension
239
- print(f"\nProcessing submitted model: {model_file_path.name}", flush=True)
240
- summary_f.write(f"\n--- Model: {model_file_path.name} ---\n")
241
-
242
- # 1. Run the submitted model to get its solution
243
- summary_f.write(" 1. Running submitted model...\n")
244
- generated_solution = run_instance(str(model_file_path))
245
- if generated_solution is None:
246
- summary_f.write(" - FAILED to run or get valid JSON solution from submitted model.\n")
247
- continue # Move to the next model
248
- models_ran_successfully += 1
249
- summary_f.write(f" - SUCCESS: Got solution. (e.g., {str(list(generated_solution.items())[:2])}...)\n")
250
-
251
- # 2. Find corresponding ground-truth model
252
- summary_f.write(f" 2. Checking against ground-truth for '{problem_name}'...\n")
253
- if problem_name not in ground_truth_models:
254
- summary_f.write(f" - FAILED: Ground-truth model for '{problem_name}' not found in dataset.\n")
255
- print(f" WARNING: Ground-truth for '{problem_name}' not found in dataset.", flush=True)
256
- continue
257
- gt_models_found += 1
258
- ground_truth_script_content = ground_truth_models[problem_name]
259
- summary_f.write(" - SUCCESS: Found ground-truth model.\n")
260
-
261
- # 3. Modify ground-truth script with solution and run self-consistency check
262
- summary_f.write(" 3. Performing self-consistency check on ground-truth model...\n")
263
- modified_gt_script = get_modified_script(ground_truth_script_content, generated_solution)
264
-
265
- consistency_passed_this_model = False
266
- objective_passed_this_model = False
267
-
268
- try:
269
- with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding='utf-8') as tmp_file:
270
- tmp_file.write(modified_gt_script)
271
- tmp_file_path_str = tmp_file.name
272
-
273
- # Run the modified ground-truth script
274
- gt_check_result = subprocess.run(
275
- [sys.executable, tmp_file_path_str],
276
- capture_output=True, text=True, timeout=SCRIPT_EXECUTION_TIMEOUT
277
- )
278
- os.unlink(tmp_file_path_str) # Clean up temp file
279
-
280
- # 4. Parse output of modified ground-truth
281
- gt_stdout = gt_check_result.stdout
282
- gt_stderr = gt_check_result.stderr
283
- # summary_f.write(f" Modified GT STDOUT: {gt_stdout[:500]}...\n") # For debugging
284
- if gt_stderr:
285
- summary_f.write(f" Modified GT STDERR: {gt_stderr[:500]}...\n")
286
-
287
- if "EVAL_OUTPUT: CONSISTENCY_CHECK_RESULT=SUCCESS" in gt_stdout:
288
- summary_f.write(" - CONSISTENCY: PASSED\n")
289
- consistency_checks_passed += 1
290
- consistency_passed_this_model = True
291
- elif "EVAL_OUTPUT: CONSISTENCY_CHECK_RESULT=UNSATISFIABLE" in gt_stdout:
292
- summary_f.write(" - CONSISTENCY: FAILED (Model became unsatisfiable)\n")
293
- else:
294
- summary_f.write(" - CONSISTENCY: FAILED (Could not determine consistency from output)\n")
295
-
296
- if "EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CONSISTENT" in gt_stdout or \
297
- "EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=NO_OBJECTIVE_DEFINED" in gt_stdout:
298
- summary_f.write(" - OBJECTIVE: PASSED (Consistent or no objective)\n")
299
- objective_checks_passed += 1
300
- objective_passed_this_model = True
301
- elif "EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=CHANGED" in gt_stdout:
302
- summary_f.write(f" - OBJECTIVE: FAILED (Value changed)\n")
303
- elif "EVAL_OUTPUT: OBJECTIVE_CHECK_RESULT=ERROR_ACCESSING_FINAL_OBJECTIVE" in gt_stdout:
304
- summary_f.write(f" - OBJECTIVE: FAILED (Error accessing final objective)\n")
305
- else:
306
- summary_f.write(" - OBJECTIVE: FAILED (Could not determine objective consistency from output)\n")
307
-
308
- except subprocess.TimeoutExpired:
309
- summary_f.write(
310
- f" - SELF-CONSISTENCY CHECK: FAILED (Timeout >{SCRIPT_EXECUTION_TIMEOUT}s running modified ground-truth)\n")
311
- print(f" ERROR: Timeout running modified GT for {problem_name}", flush=True)
312
- except Exception as e_gt_run:
313
- summary_f.write(
314
- f" - SELF-CONSISTENCY CHECK: FAILED (Error running modified ground-truth: {e_gt_run})\n")
315
- print(f" ERROR: Running modified GT for {problem_name}: {e_gt_run}", flush=True)
316
-
317
- # Final statistics
318
- summary_f.write("\n" + "=" * 30 + "\n")
319
- summary_f.write("Overall Evaluation Statistics:\n")
320
- summary_f.write(f" Total Submitted Models Parsed: {total_submitted_models}\n")
321
- summary_f.write(
322
- f" Models That Ran Successfully (produced solution): {models_ran_successfully}/{total_submitted_models}\n")
323
- summary_f.write(
324
- f" Corresponding Ground-Truth Models Found: {gt_models_found}/{models_ran_successfully} (of those that ran)\n")
325
- summary_f.write(f" Consistency Checks Passed: {consistency_checks_passed}/{gt_models_found}\n")
326
- summary_f.write(f" Objective Value Checks Passed: {objective_checks_passed}/{gt_models_found}\n")
327
-
328
- # Define an overall score, e.g. number of models that passed both checks against found GT
329
- fully_passed_models = 0
330
- # This needs re-evaluation logic, but for now let's say a score is consistency+objective passes
331
- # This simple score is just the sum of passes, could be more nuanced
332
- overall_score = consistency_checks_passed + objective_checks_passed
333
- summary_f.write(f"\nScore: {overall_score} (Raw sum of passed checks)\n") # For Gradio app to parse
334
-
335
- elapsed_time = time.time() - start_time
336
- print(f"eval.py: Evaluation finished in {elapsed_time:.2f} seconds.", flush=True)
337
- print(f"eval.py: Summary written to {summary_file_path}", flush=True)
338
- return 0 # Success
339
-
340
-
341
- if __name__ == "__main__":
342
- if len(sys.argv) < 3:
343
- print("Usage: python eval.py <path_to_submitted_directory> <path_to_results_base_directory>")
344
- print("Example: python eval.py ./submissions/my_run ./results")
345
- sys.exit(1)
346
-
347
- submission_dir = sys.argv[1]
348
- results_base_dir = sys.argv[2]
349
-
350
- # Simple check if submission_dir exists
351
- if not Path(submission_dir).is_dir():
352
- print(f"Error: Submission directory '{submission_dir}' not found or not a directory.")
353
- sys.exit(1)
354
-
355
- exit_code = main(submission_dir, results_base_dir)
356
- sys.exit(exit_code)
 
src/config.py ADDED
@@ -0,0 +1,11 @@
1
+ # File and directory paths
2
+ EVAL_SCRIPT_PATH = "src/eval.py"
3
+ LOCAL_TEMP_SUBMISSIONS_DIR = "../temp_submissions_app"
4
+
5
+ # Hugging Face Dataset Configuration
6
+ DATASET_REPO_ID = "kostis-init/my-storage"
7
+ DS_SUBMISSIONS_PATH = "submissions"
8
+ DS_RESULTS_PATH = "results"
9
+
10
+ # leaderboard
11
+ LDB_COLS = ["Submission Name", "Execution (%)", "Consistency (%)", "Final Solution Accuracy (%)", "# of Models submitted"]
src/eval.py ADDED
@@ -0,0 +1,403 @@
1
+ # eval.py
2
+ import sys
3
+ import os
4
+ import time
5
+ import json
6
+ import subprocess
7
+ import tempfile
8
+ from pathlib import Path
9
+ from datasets import load_dataset # Hugging Face datasets library
10
+ from huggingface_hub import HfApi, hf_hub_download, snapshot_download # For user data dataset
11
+ from huggingface_hub.utils import RepositoryNotFoundError
12
+
13
+ # --- Configuration ---
14
+
15
+ GT_DATASET_NAME = "kostis-init/CP-Bench"
16
+
17
+ # Column names in the Hugging Face dataset for problem identifier and model script
18
+ GT_PROBLEM_NAME_COLUMN = "id"
19
+ GT_MODEL_CODE_COLUMN = "model"
20
+
21
+ # Timeout for running individual model scripts (both generated and modified ground-truth)
22
+ SCRIPT_EXECUTION_TIMEOUT = 60 # seconds
23
+
24
+ """Handles evaluation of submissions."""
25
+
26
+ import os
27
+ import sys
28
+ import subprocess
29
+ import threading
30
+ from pathlib import Path
31
+
32
+ from src.config import EVAL_SCRIPT_PATH, DATASET_REPO_ID, DS_RESULTS_PATH
33
+
34
+
35
+ def run_evaluation(submission_path):
36
+
37
+ if not Path(EVAL_SCRIPT_PATH).exists():
38
+ print(f"ERROR: Eval script '{EVAL_SCRIPT_PATH}' not found")
39
+ return
40
+
41
+ print(f"Starting evaluation for: {submission_path}")
42
+
43
+ command = [
44
+ sys.executable,
45
+ EVAL_SCRIPT_PATH,
46
+ DATASET_REPO_ID,
47
+ submission_path,
48
+ DS_RESULTS_PATH
49
+ ]
50
+
51
+ try:
52
+ process = subprocess.run(
53
+ command,
54
+ capture_output=True,
55
+ text=True,
56
+ check=False,
57
+ timeout=600,
58
+ encoding='utf-8',
59
+ )
60
+
61
+ if process.returncode == 0:
62
+ print(f"Evaluation successful for: {submission_path}")
63
+ else:
64
+ print(f"Evaluation failed for: {submission_path}")
65
+ print(f"STDERR: {process.stderr}")
66
+
67
+ except subprocess.TimeoutExpired:
68
+ print(f"Evaluation timed out for: {submission_path}")
69
+ except Exception as e:
70
+ print(f"Error running evaluation: {e}")
71
+
72
+ print(f"Evaluation process complete for: {submission_path}")
73
+
74
+
75
+ def start_background_evaluation(submission_path):
76
+ """Start evaluation in a background thread."""
77
+ thread = threading.Thread(
78
+ target=lambda: run_evaluation(submission_path),
79
+ daemon=True
80
+ )
81
+ thread.start()
82
+ return True
83
+
84
+
85
+ def extract_json_from_string(text_output: str):
86
+ """
87
+ Attempts to find and parse the first valid JSON object or array from a string.
88
+ Handles cases where JSON is preceded or followed by non-JSON text.
89
+ """
90
+ idx = 0
91
+ while idx < len(text_output):
92
+ # Find the next potential start of a JSON structure
93
+ start_brace = text_output.find('{', idx)
94
+ start_bracket = text_output.find('[', idx)
95
+
96
+ if start_brace == -1 and start_bracket == -1:
97
+ # No more '{' or '[' found in the rest of the string
98
+ return None
99
+
100
+ # Determine the actual starting character for this attempt
101
+ if start_brace != -1 and (start_bracket == -1 or start_brace < start_bracket):
102
+ json_start_index = start_brace
103
+ else:
104
+ json_start_index = start_bracket
105
+
106
+ potential_json_segment = text_output[json_start_index:]
107
+
108
+ try:
109
+ # Use raw_decode to parse the first valid JSON object from the segment
110
+ decoder = json.JSONDecoder()
111
+ json_obj, end_index_in_segment = decoder.raw_decode(potential_json_segment)
112
+ # Successfully parsed a JSON object
113
+ return json_obj
114
+ except json.JSONDecodeError:
115
+ # This segment (starting at json_start_index) wasn't a valid JSON.
116
+ # Advance the search index past the character that caused the current attempt.
117
+ idx = json_start_index + 1
118
+
119
+ return None # No valid JSON found in the entire string
120
+
121
+
122
+ def run_instance(instance_path_str: str,
123
+ timeout: int = SCRIPT_EXECUTION_TIMEOUT): # SCRIPT_EXECUTION_TIMEOUT should be defined
124
+ """Run the instance file and robustly capture the JSON output."""
125
+ command = [sys.executable, instance_path_str]
126
+ instance_name = Path(instance_path_str).name
127
+ try:
128
+ result = subprocess.run(command, capture_output=True, text=True, timeout=timeout, encoding='utf-8',
129
+ errors='replace')
130
+
131
+ # Check return code first
132
+ if result.returncode != 0:
133
+ # Log stderr for debugging if the script itself failed
134
+ error_message = result.stderr[:500].strip() if result.stderr else "<No stderr>"
135
+ print(f" ERROR: Running {instance_name} (Return Code: {result.returncode}): {error_message}", flush=True)
136
+ return None
137
+
138
+ # Attempt to extract JSON from stdout
139
+ stdout_text = result.stdout
140
+ if not stdout_text or not stdout_text.strip():
141
+ print(f" ERROR: No stdout from {instance_name}.", flush=True)
142
+ return None
143
+
144
+ solution = extract_json_from_string(stdout_text)
145
+
146
+ if solution is None:
147
+ # Be more verbose if JSON extraction fails
148
+ abbreviated_stdout = stdout_text.replace('\n', '\\n')[:300] # Show newlines as \n for brevity
149
+ print(
150
+ f" ERROR: Could not extract valid JSON from {instance_name}. Raw stdout (abbreviated): '{abbreviated_stdout}...'",
151
+ flush=True)
152
+ return None
153
+
154
+ return solution
155
+
156
+ except subprocess.TimeoutExpired:
157
+ print(f" ERROR: Timeout running {instance_name} (>{timeout}s)", flush=True)
158
+ return None
159
+ except Exception as e:
160
+ print(f" ERROR: Unexpected error running {instance_name}: {e}", flush=True)
161
+ return None
162
+
163
+
164
+ def add_constraints_as_string(solution):
165
+ """Generate constraints as a string to be added to the original script."""
166
+ constraints = ""
167
+ if solution: # Ensure solution is not None
168
+ for key, value in solution.items():
169
+ # Basic escaping for string values if they occur, though typically solutions are numeric/boolean
170
+ if isinstance(value, str):
171
+ constraints += f"\nmodel += ({key} == \"{value}\")"
172
+ else:
173
+ constraints += f"\nmodel += ({key} == {value})"
174
+ return constraints
175
+
176
+
177
+ def get_modified_script(script_content, solution):
178
+ """Add constraints to the script content and self-consistency checks."""
179
+ constraints_str = add_constraints_as_string(solution)
180
+ modified_script = f"{script_content}\n{constraints_str}"
181
+ modified_script += """
182
+ # Print the absolute path of the current directory along with the script name
183
+ import os
184
+ print(os.path.abspath(__file__))
185
+
186
+ # Keep old objective
187
+ old_objective = None
188
+ if hasattr(model, 'objective_is_min') and model.objective_is_min is not None:
189
+ old_objective = model.objective_value()
190
+
191
+ # Check self-consistency
192
+ if not model.solve():
193
+ print('ERROR: The model is unsatisfiable with the self-consistency constraints')
194
+ else:
195
+ print('SUCCESS: Model is consistent')
196
+
197
+ # Check if the objective value is the same
198
+ if old_objective is None:
199
+ print('SUCCESS: No objective defined')
200
+ elif model.objective_value() != old_objective:
201
+ print('ERROR: The objective value has changed')
202
+ else:
203
+ print('SUCCESS: Objective value is consistent')
204
+ """
205
+ return modified_script
206
+
207
+
208
+ # --- Main Evaluation Logic ---
209
+ def main(
210
+ user_dataset_repo_id: str,
211
+ submission_path_in_dataset: str, # e.g., "submissions/uploaded_dir_name"
212
+ results_base_path_in_dataset: str # e.g., "results"
213
+ ):
214
+ start_time = time.time()
215
+ # Infer submission name for logging and result path generation
216
+ submission_name_for_files = Path(submission_path_in_dataset).name
217
+
218
+ print(f"eval.py: Starting evaluation for submission: '{submission_name_for_files}'", flush=True)
219
+ print(f" User Data Repo: {user_dataset_repo_id}", flush=True)
220
+ print(f" Submission to download from: {submission_path_in_dataset}", flush=True)
221
+ print(f" Results to upload to: {results_base_path_in_dataset}/{submission_name_for_files}", flush=True)
222
+
223
+ hf_api = HfApi() # Will use HF_TOKEN from environment
224
+
225
+ # Create a top-level temporary directory for all operations for this eval run
226
+ with tempfile.TemporaryDirectory(prefix="eval_run_") as top_level_temp_dir_str:
227
+ top_level_temp_dir = Path(top_level_temp_dir_str)
228
+ local_submission_dir = top_level_temp_dir / "submissions"
229
+ local_result_dir_for_upload = top_level_temp_dir / "results"
230
+
231
+ os.makedirs(local_submission_dir, exist_ok=True)
232
+ os.makedirs(local_result_dir_for_upload, exist_ok=True)
233
+
234
+ # Path for the summary file within the local temporary result directory
235
+ summary_file_path = local_result_dir_for_upload / "summary.txt"
236
+
237
+ # 1. Download submitted files from HF Dataset
238
+ print(f" Downloading submission files from '{submission_path_in_dataset}' to '{local_submission_dir}'...",
239
+ flush=True)
240
+ try:
241
+ # Download the relevant submission files
242
+ snapshot_download(
243
+ repo_id=user_dataset_repo_id,
244
+ repo_type="dataset",
245
+ local_dir=local_submission_dir,
246
+ allow_patterns=[f"{submission_path_in_dataset}/*"],
247
+ )
248
+ print(f" Downloaded submission files successfully.", flush=True)
249
+
250
+ except Exception as e_download:
251
+ print(f" CRITICAL ERROR - Failed to download submission files: {e_download}", flush=True)
252
+ return 1
253
+
254
+ # 2. Load ground-truth dataset (remains the same)
255
+ print(f" Loading ground-truth dataset '{GT_DATASET_NAME}'...", flush=True)
256
+ try:
257
+ gt_dataset = load_dataset(GT_DATASET_NAME, split="train", trust_remote_code=True)
258
+ ground_truth_models = {
259
+ item[GT_PROBLEM_NAME_COLUMN]: item[GT_MODEL_CODE_COLUMN]
260
+ for item in gt_dataset if
261
+ GT_PROBLEM_NAME_COLUMN in item and GT_MODEL_CODE_COLUMN in item and item[GT_MODEL_CODE_COLUMN]
262
+ }
263
+ if not ground_truth_models: raise ValueError("No models in GT dataset.")
264
+ print(f" Loaded {len(ground_truth_models)} ground-truth models.", flush=True)
265
+ except Exception as e_gt:
266
+ print(f" CRITICAL ERROR - Failed to load ground-truth dataset: {e_gt}", flush=True)
267
+ with open(summary_file_path, "w") as f:
268
+ f.write(f"CRITICAL ERROR: Failed to load ground-truth dataset '{GT_DATASET_NAME}'.\nError: {e_gt}\n")
269
+ # (Attempt to upload error summary)
270
+ return 1
271
+
272
+ # Statistics
273
+ total_submitted_models = 0
274
+ models_ran_successfully = 0
275
+ consistency_checks_passed = 0
276
+ objective_checks_passed = 0
277
+ all_checks_passed = 0
278
+ gt_models_found = 0
279
+
280
+ with open(summary_file_path, "w", encoding="utf-8") as summary_f:
281
+ summary_f.write(f"Evaluation Summary for Submission: {submission_name_for_files}\n")
282
+ summary_f.write(f"User Data Repo: {user_dataset_repo_id}\n")
283
+ summary_f.write(f"Submission Path in Dataset: {submission_path_in_dataset}\n")
284
+ summary_f.write(f"Ground-Truth Dataset: {GT_DATASET_NAME}\n")
285
+ summary_f.write("-" * 30 + "\n")
286
+
287
+ # Iterate through downloaded submitted models
288
+ submitted_model_files = list((local_submission_dir / submission_path_in_dataset).rglob('*.py'))
289
+ if not submitted_model_files:
290
+ summary_f.write("No .py model files found in downloaded submission.\n")
291
+ print(" No .py model files found in downloaded submission.", flush=True)
292
+
293
+ for model_file_path in submitted_model_files:
294
+ total_submitted_models += 1
295
+ problem_name = model_file_path.stem
296
+ print(f"\n Processing downloaded model: {model_file_path.name}", flush=True)
297
+ summary_f.write(f"\n--- Model: {model_file_path.name} ---\n")
298
+
299
+ summary_f.write(" 1. Running submitted model...\n")
300
+ generated_solution = run_instance(str(model_file_path))
301
+ if generated_solution is None:
302
+ summary_f.write(" - FAILED to run or get valid JSON solution from submitted model.\n")
303
+ continue
304
+ models_ran_successfully += 1
305
+ summary_f.write(f" - SUCCESS: Got solution.\n")
306
+
307
+ summary_f.write(f" 2. Checking against ground-truth for '{problem_name}'...\n")
308
+ if problem_name not in ground_truth_models:
309
+ summary_f.write(f" - FAILED: Ground-truth model for '{problem_name}' not found in dataset.\n")
310
+ continue
311
+ gt_models_found += 1
312
+ ground_truth_script_content = ground_truth_models[problem_name]
313
+ summary_f.write(" - SUCCESS: Found ground-truth model.\n")
314
+
315
+ summary_f.write(" 3. Performing self-consistency check on ground-truth model...\n")
316
+ modified_gt_script = get_modified_script(ground_truth_script_content, generated_solution)
317
+
318
+ try:
319
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding='utf-8',
320
+ dir=top_level_temp_dir) as tmp_file:
321
+ tmp_file.write(modified_gt_script)
322
+ tmp_file_path_str = tmp_file.name
323
+
324
+ gt_check_result = subprocess.run(
325
+ [sys.executable, tmp_file_path_str],
326
+ capture_output=True, text=True, timeout=SCRIPT_EXECUTION_TIMEOUT, encoding='utf-8',
327
+ )
328
+ os.unlink(tmp_file_path_str)
329
+
330
+ gt_stdout = gt_check_result.stdout
331
+ # ... (parse EVAL_OUTPUT tags for consistency and objective)
332
+ if "SUCCESS: Model is consistent" in gt_stdout:
333
+ summary_f.write(" - CONSISTENCY: PASSED\n")
334
+ consistency_checks_passed += 1
335
+ else:
336
+ summary_f.write(
337
+ " - CONSISTENCY: FAILED (Details in logs or stdout)\n")
338
+
339
+ if "SUCCESS: No objective defined" in gt_stdout or "SUCCESS: Objective value is consistent" in gt_stdout:
340
+ summary_f.write(" - OBJECTIVE: PASSED\n")
341
+ objective_checks_passed += 1
342
+ else:
343
+ summary_f.write(" - OBJECTIVE: FAILED (Details in logs or stdout)\n")
344
+
345
+ if "SUCCESS: Model is consistent" in gt_stdout and ("SUCCESS: No objective defined" in gt_stdout or "SUCCESS: Objective value is consistent" in gt_stdout):
346
+ summary_f.write(" - SELF-CONSISTENCY CHECK: PASSED fully\n")
347
+ all_checks_passed += 1
348
+
349
+ except Exception as e_gt_run:
350
+ summary_f.write(f" - SELF-CONSISTENCY CHECK: FAILED (Error: {e_gt_run})\n")
351
+
352
+ # Final statistics (write to summary_f)
353
+ summary_f.write("\n" + "=" * 30 + "\n")
354
+ summary_f.write("Overall Evaluation Statistics:\n")
355
+ summary_f.write(f" Total Submitted Models Parsed: {total_submitted_models}\n")
356
+ summary_f.write(f" Models That Ran Successfully: {models_ran_successfully}/{total_submitted_models}\n")
357
+ summary_f.write(f" Ground-Truth Models Found: {gt_models_found}/{models_ran_successfully}\n")
358
+ summary_f.write(f" Consistency Checks Passed: {consistency_checks_passed}/{gt_models_found}\n")
359
+ summary_f.write(f" Objective Value Checks Passed: {objective_checks_passed}/{gt_models_found}\n")
360
+ summary_f.write("=" * 30 + "\n")
361
+ summary_f.write("Final Evaluation Summary:\n")
362
+ summary_f.write(f" Execution perc: {models_ran_successfully / len(ground_truth_models) * 100:.2f}%\n")
363
+ summary_f.write(f" Consistency perc: {consistency_checks_passed / len(ground_truth_models) * 100:.2f}%\n")
364
+ summary_f.write(f" Objective perc: {objective_checks_passed / len(ground_truth_models) * 100:.2f}%\n")
365
+ summary_f.write(f" Final Solution Accuracy perc: {all_checks_passed / len(ground_truth_models) * 100:.2f}%\n")
366
+ summary_f.write("-" * 30 + "\n")
367
+
368
+ # 4. Upload the entire local_result_dir_for_upload to HF Dataset
369
+ # This directory contains summary.txt and could contain other result files.
370
+ result_path_on_hub = f"{results_base_path_in_dataset}/{submission_name_for_files}"
371
+ print(f" Uploading results from '{local_result_dir_for_upload}' to '{result_path_on_hub}' on dataset...",
372
+ flush=True)
373
+ try:
374
+ hf_api.upload_folder(
375
+ folder_path=str(local_result_dir_for_upload),
376
+ path_in_repo=result_path_on_hub,
377
+ repo_id=user_dataset_repo_id,
378
+ repo_type="dataset",
379
+ commit_message=f"Evaluation results for {submission_name_for_files}"
380
+ )
381
+ print(" Results uploaded successfully.", flush=True)
382
+ except Exception as e_upload:
383
+ print(f" CRITICAL ERROR: Failed to upload results: {e_upload}", flush=True)
384
+ # The summary.txt was written locally, but upload failed.
385
+
386
+ elapsed_time = time.time() - start_time
387
+ print(f"eval.py: Evaluation finished in {elapsed_time:.2f} seconds.", flush=True)
388
+ return 0
389
+
390
+
391
+ if __name__ == "__main__":
392
+ if len(sys.argv) < 4:
393
+ print(
394
+ "Usage: python eval.py <user_dataset_repo_id> <submission_path_in_dataset> <results_base_path_in_dataset>")
395
+ print("Example: python eval.py your-username/my-storage submissions/run123 results")
396
+ sys.exit(1)
397
+
398
+ arg_user_dataset_repo_id = sys.argv[1]
399
+ arg_submission_path_in_dataset = sys.argv[2]
400
+ arg_results_base_path_in_dataset = sys.argv[3]
401
+
402
+ exit_code = main(arg_user_dataset_repo_id, arg_submission_path_in_dataset, arg_results_base_path_in_dataset)
403
+ sys.exit(exit_code)
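The self-consistency check that get_modified_script appends assumes each ground-truth program exposes a CPMpy-style model object supporting += constraints, solve(), and objective_value(). A standalone sketch of the same idea on a hypothetical toy model (not a benchmark instance); in src/eval.py the pinned values come from the submitted model's JSON output rather than from the model itself:

# Sketch: pin decision variables to a candidate solution and re-solve.
from cpmpy import Model, intvar

x = intvar(0, 10, name="x")
y = intvar(0, 10, name="y")
model = Model(x + y <= 10)
model.maximize(x + 2 * y)

assert model.solve()
old_objective = model.objective_value()

candidate = {"x": int(x.value()), "y": int(y.value())}  # stands in for the submitted solution
model += (x == candidate["x"])
model += (y == candidate["y"])

if not model.solve():
    print("ERROR: The model is unsatisfiable with the self-consistency constraints")
elif model.objective_value() != old_objective:
    print("ERROR: The objective value has changed")
else:
    print("SUCCESS: Model is consistent and the objective value is unchanged")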
src/hf_utils.py ADDED
@@ -0,0 +1,128 @@
1
+ """Utilities for interacting with the Hugging Face Hub."""
2
+
3
+ import os
4
+ import shutil
5
+ from pathlib import Path
6
+ import pandas as pd
7
+ from huggingface_hub import HfApi, hf_hub_download, list_repo_files
8
+ from huggingface_hub.utils import RepositoryNotFoundError, HFValidationError
9
+
10
+ from src.config import DATASET_REPO_ID, DS_RESULTS_PATH, DS_SUBMISSIONS_PATH, LDB_COLS
11
+
12
+ # Initialize HfApi
13
+ try:
14
+ HF_API = HfApi()
15
+ print(f"Successfully initialized HfApi. Will use dataset repo: {DATASET_REPO_ID}")
16
+ except Exception as e:
17
+ print(f"Failed to initialize HfApi: {e}")
18
+ HF_API = None
19
+
20
+
21
+ def load_leaderboard_data():
22
+ """Load leaderboard data from Hugging Face Dataset."""
23
+ if not HF_API:
24
+ return pd.DataFrame(columns=LDB_COLS)
25
+
26
+ leaderboard_entries = []
27
+ processed_result_dirs = set()
28
+
29
+ try:
30
+ # List all files in the results path of the dataset
31
+ repo_files = HF_API.list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")
32
+
33
+ # Find all summary files
34
+ summary_files = [
35
+ f for f in repo_files
36
+ if f.endswith("summary.txt") and f.startswith(DS_RESULTS_PATH + "/")
37
+ ]
38
+ summary_files.sort(reverse=True)
39
+
40
+ for file_path in summary_files:
41
+ dir_name = Path(file_path).parent.name
42
+ if dir_name in processed_result_dirs:
43
+ continue
44
+
45
+ processed_result_dirs.add(dir_name)
46
+ entry = {LDB_COLS[0]: dir_name, LDB_COLS[1]: 'N/A', LDB_COLS[2]: 'N/A', LDB_COLS[3]: 'N/A', LDB_COLS[4]: 0}
47
+
48
+ # Download summary file
49
+ temp_dir = os.path.join("temp_hf_downloads", dir_name)
50
+ local_summary_path = hf_hub_download(
51
+ repo_id=DATASET_REPO_ID,
52
+ filename=file_path,
53
+ repo_type="dataset",
54
+ local_dir=temp_dir,
55
+ )
56
+
57
+ # Count files
58
+ files_in_result_dir = [
59
+ f for f in repo_files
60
+ if f.startswith(f"{DS_RESULTS_PATH}/{dir_name}/") and not f.endswith("/")
61
+ ]
62
+
63
+ # Parse score from summary
64
+ if Path(local_summary_path).exists():
65
+ with open(local_summary_path, "r", encoding="utf-8") as f:
66
+ for line in f:
67
+ if 'Execution perc' in line:
68
+ entry[LDB_COLS[1]] = float(line.split(":")[1].strip().replace("%", ""))
69
+ if 'Consistency perc' in line:
70
+ entry[LDB_COLS[2]] = float(line.split(":")[1].strip().replace("%", ""))
71
+ if 'Final Solution Accuracy' in line:
72
+ entry[LDB_COLS[3]] = float(line.split(":")[1].strip().replace("%", ""))
73
+ if 'Total Submitted Models Parsed' in line:
74
+ entry[LDB_COLS[4]] = int(line.split(":")[1].strip())
75
+ os.remove(local_summary_path)
76
+
77
+ leaderboard_entries.append(entry)
78
+
79
+ except Exception as e:
80
+ print(f"Error loading leaderboard data: {e}")
81
+
82
+ finally:
83
+ # Clean up
84
+ if Path("temp_hf_downloads").exists():
85
+ shutil.rmtree("temp_hf_downloads", ignore_errors=True)
86
+
87
+ if not leaderboard_entries:
88
+ return pd.DataFrame(columns=LDB_COLS)
89
+
90
+ return pd.DataFrame(leaderboard_entries)
91
+
92
+
93
+ def upload_submission(uploaded_files, dir_name):
94
+ """Upload submission to Hugging Face Dataset."""
95
+ if not HF_API:
96
+ return False, "Hugging Face API not initialized"
97
+
98
+ try:
99
+ submission_path = f"{DS_SUBMISSIONS_PATH}/{dir_name}"
100
+
101
+ for file in uploaded_files:
102
+ file_name = os.path.basename(file.name)
103
+ HF_API.upload_file(
104
+ path_or_fileobj=file,
105
+ path_in_repo=f"{submission_path}/{file_name}",
106
+ repo_id=DATASET_REPO_ID,
107
+ repo_type="dataset",
108
+ commit_message=f"Upload submission: {dir_name}"
109
+ )
110
+
111
+ return True, submission_path
112
+ except Exception as e:
113
+ return False, f"Upload error: {str(e)}"
114
+
115
+
116
+ def check_name_exists(submission_name):
117
+ if not HF_API:
118
+ return False
119
+
120
+ try:
121
+ repo_files = HF_API.list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")
122
+ for file_path in repo_files:
123
+ if file_path.startswith(f"{DS_SUBMISSIONS_PATH}/{submission_name}"):
124
+ return True
125
+ except Exception as e:
126
+ print(f"Error checking name existence: {e}")
127
+
128
+ return False
src/ui.py ADDED
@@ -0,0 +1,83 @@
1
+ import gradio as gr
2
+ from pathlib import Path
3
+
4
+ from src.hf_utils import load_leaderboard_data, upload_submission, check_name_exists
5
+ from src.eval import start_background_evaluation
6
+
7
+
8
+ def handle_upload(submission_name, uploaded_files, progress=gr.Progress()):
9
+ """Handle file upload and start evaluation."""
10
+ if not uploaded_files or len(uploaded_files) == 0:
11
+ return "No directory uploaded or directory is empty; please try again."
12
+
13
+ # normalize the submission name
14
+ submission_name = submission_name.strip().replace(" ", "_").lower()
15
+ # keep only alphanumeric characters and underscores, restrict to 30 characters
16
+ submission_name = "".join(
17
+ c for c in submission_name if c.isalnum() or c == "_"
18
+ )[:30]
19
+
20
+ if not submission_name or submission_name.strip() == "":
21
+ return "Submission name is required"
22
+
23
+ if check_name_exists(submission_name):
24
+ return f"Submission name '{submission_name}' already exists. Please choose a different name."
25
+
26
+ try:
27
+ progress(0.3, "Uploading to Hugging Face...")
28
+
29
+ # Upload the directory to Hugging Face
30
+ success, result = upload_submission(uploaded_files, submission_name)
31
+ if not success:
32
+ return f"Upload failed: {result}"
33
+
34
+ progress(0.7, "Starting evaluation...")
35
+
36
+ # Start evaluation
37
+ start_background_evaluation(result)
38
+
39
+ progress(1.0, "Process complete")
40
+ return f"Upload complete. Evaluation started for: {submission_name}. Refresh the leaderboard to see results. Do not worry if the leaderboard does not update immediately; it may take some time for the results to appear."
41
+
42
+ except Exception as e:
43
+ return f"Error processing upload: {str(e)}"
44
+
45
+
46
+ def create_ui():
47
+ """Create and return Gradio UI."""
48
+ with gr.Blocks(title="CP-Bench Leaderboard") as demo:
49
+ gr.Markdown("# CP-Bench Leaderboard")
50
+
51
+ with gr.Row():
52
+ with gr.Column(scale=1):
53
+ gr.Markdown("## 📤 Upload Submission")
54
+
55
+ submission_name = gr.Textbox(
56
+ label="Submission Name (required)",
57
+ placeholder="Enter a unique name for your submission",
58
+ interactive=True,
59
+ info="This name will appear on the leaderboard"
60
+ )
61
+ upload_button = gr.UploadButton("Click to Upload Directory", file_count="directory")
62
+ status_box = gr.Textbox(label="Status", interactive=False)
63
+
64
+ with gr.Column(scale=3):
65
+ gr.Markdown("## 🏆 Results Leaderboard")
66
+ leaderboard = gr.DataFrame(value=load_leaderboard_data, label="Leaderboard", interactive=False)
67
+ refresh_button = gr.Button("🔄 Refresh Leaderboard")
68
+
69
+ # Event handlers
70
+ upload_button.upload(
71
+ fn=handle_upload,
72
+ inputs=[submission_name, upload_button],
73
+ outputs=[status_box],
74
+ show_progress="full",
75
+ )
76
+
77
+ refresh_button.click(
78
+ fn=load_leaderboard_data,
79
+ inputs=None,
80
+ outputs=[leaderboard]
81
+ )
82
+
83
+ return demo
src/utils.py ADDED
@@ -0,0 +1,7 @@
1
+ import os
2
+
3
+ from src.config import LOCAL_TEMP_SUBMISSIONS_DIR
4
+
5
+
6
+ def setup_directories():
7
+ os.makedirs(LOCAL_TEMP_SUBMISSIONS_DIR, exist_ok=True)