Refactor evaluation logic: streamline user_eval.py, update evaluation script references, and clean up eval.py
70cc330
import json
from pathlib import Path

import gradio as gr

from src.config import SUPPORTED_FRAMEWORKS
from src.hf_utils import load_leaderboard_data, upload_submission, check_name_exists
from src.eval import start_background_evaluation

def handle_upload(submission_name, uploaded_file, report_file, model_framework, progress=gr.Progress()):
    """Handle file upload and start evaluation."""
    if model_framework not in SUPPORTED_FRAMEWORKS:
        return f"Unsupported modelling framework: {model_framework}. Supported frameworks are: {', '.join(SUPPORTED_FRAMEWORKS)}"
    if not uploaded_file:
        return "No file uploaded. Please upload a valid submission file."
    if report_file and not report_file.name.endswith(".pdf"):
        return "Invalid report format. Please upload a PDF file."

    # Normalise the submission name, e.g. "My Cool Submission" -> "my_cool_submission"
    submission_name = submission_name.strip().replace(" ", "_").lower()
    # Keep only alphanumeric characters and underscores, restrict to 30 characters
    submission_name = "".join(
        c for c in submission_name if c.isalnum() or c == "_"
    )[:30]
    if not submission_name:
        return "Submission name is required."
    if check_name_exists(submission_name):
        return f"Submission name '{submission_name}' already exists. Please choose a different name."

    try:
        progress(0.3, "Uploading to Hugging Face...")

        # Check that the file is a valid JSONL file
        if not uploaded_file.name.endswith(".jsonl"):
            return "Invalid file format. Please upload a .jsonl file."

        # Check that every line contains the required keys ('id' and 'model')
        with open(uploaded_file.name, "r") as file:
            found_one = False
            for line in file:
                found_one = True
                json_object = json.loads(line)
                if not all(key in json_object for key in ["id", "model"]):
                    return "Invalid content. Each line must contain 'id' and 'model' keys."
            if not found_one:
                return "Empty file. Please upload a valid JSONL file."

        success, result = upload_submission(uploaded_file, submission_name, report_file, model_framework)
        if not success:
            return f"Upload failed: {result}"

        progress(0.7, "Starting evaluation...")
        # Start the evaluation in the background so the UI stays responsive
        start_background_evaluation(result)

        progress(1.0, "Process complete")
        return (
            f"✅ Submission '{submission_name}' uploaded successfully!\n"
            f"Do not worry if the leaderboard does not update immediately; "
            f"it may take some time for the results to appear (around 5-10 minutes). "
            f"Feel free to close the tab and check back later.")
    except Exception as e:
        return f"Error processing upload: {str(e)}"

def create_ui():
    """Create and return the Gradio UI."""
    with gr.Blocks(title="Welcome to the CP-Bench leaderboard!") as demo:
        gr.Markdown("# CP-Bench Leaderboard")
        gr.Markdown(
            "This leaderboard is designed to evaluate LLM-generated constraint models for the problems "
            "in the [CP-Bench](https://huggingface.co/datasets/kostis-init/CP-Bench) dataset."
            "\n\n"
            "## How to Submit\n"
            "1. **Name your submission**: Choose a unique name for your submission (e.g., `my_cool_submission`). "
            "This name will be used to identify your submission on the leaderboard.\n"
            "2. **Select the modelling framework**: Indicate which modelling framework your submission uses (e.g., MiniZinc, CPMpy, OR-Tools).\n"
            "3. **Upload a PDF report**: This is optional, but we highly encourage you to upload a report "
            " (in PDF format) describing your approach. As this is an open competition, we want to avoid submissions "
            " that just copy the models from the dataset. The report can be a short description of your approach, "
            " the models you generated, and any other relevant information.\n"
            "4. **Upload your submission**: Upload a **single** `.jsonl` file containing the generated models. "
            " **Each line in the file should be a JSON object with two keys: `id` and `model`.**\n"
            " * `id`: The ID of the problem exactly as it appears in the original dataset (e.g., `csplib__csplib_001_car_sequencing`).\n"
            " * `model`: The generated model for the problem (as a string representing runnable code). Make sure that it eventually prints its solution as JSON, with the key(s) described in the problem's `decision_variables` entry and the value types expected for that problem. This is part of the evaluation as well: unexpected keys or value types are considered incorrect, because our automatic evaluation is based on the solution printed by the submitted models.\n"
            " * An example submission file can be found [here](https://huggingface.co/spaces/kostis-init/CP-Bench-competition/blob/main/template_submission.jsonl).\n"
            "\n To help you get started, we also provide a **template script [here](https://huggingface.co/spaces/kostis-init/CP-Bench-competition/blob/main/template.py)**. This script acts as a backbone, showing how to produce a simple, runnable submission for one of the problems. You can use it as a starting point for developing your own logic.\n"
            "5. **Check the leaderboard**: After uploading, it may take a few minutes for a submission to be evaluated and appear on the leaderboard.\n"
            "\n\n"
            "## Important Notes\n"
            "1. **Submission Name**: The submission name must be different from any existing submission name.\n"
            "2. **File Format**: Ensure that the uploaded files are in the correct format. The submission file must be a `.jsonl` file, and the report must be a `.pdf` file.\n"
            "3. **Evaluation Script**: It is highly recommended to use the evaluation script provided [here](https://huggingface.co/spaces/kostis-init/CP-Bench-competition/blob/main/src/user_eval.py) to check your results before submission. You can run the script as follows:\n"
            " ```bash\n"
            " python user_eval.py --submission_file path/to/my/submission.jsonl --modelling_framework CPMpy\n"
            " ```\n"
            " This will evaluate your submission locally and print the results to the console.\n"
            "4. **Modelling Frameworks**: Currently, the supported modelling frameworks are MiniZinc, CPMpy, and OR-Tools. More frameworks can be added (feel free to submit pull requests).\n"
            "\n\n"
            "### If you have any questions or issues, feel free to reach out to us.\n"
            "---\n"
        )
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("## 📤 Upload Submission")
                submission_name = gr.Textbox(
                    label="Submission Name (required)",
                    placeholder="Enter a unique name for your submission",
                    interactive=True,
                    info="This name will appear on the leaderboard"
                )
                model_framework = gr.Dropdown(
                    label="Modelling Framework (required)",
                    choices=SUPPORTED_FRAMEWORKS,
                    value=None,
                    multiselect=False,
                    interactive=True,
                    info="Select the modelling framework used for your submission.",
                    allow_custom_value=False,
                    filterable=False,
                )
                with gr.Row():
                    report_file = gr.File(
                        label="Upload PDF Report (optional, but recommended)",
                        file_types=[".pdf"],
                        file_count="single",
                        interactive=True,
                    )
                    submission_file = gr.File(
                        label="Upload Submission File (required, .jsonl)",
                        file_types=[".jsonl"],
                        file_count="single",
                        interactive=True,
                    )
                upload_button = gr.Button("Click to Upload Submission")
                status_box = gr.Textbox(label="Status", interactive=False)

            with gr.Column(scale=2):
                gr.Markdown("## 📊 Results Leaderboard")
                leaderboard = gr.DataFrame(value=load_leaderboard_data, interactive=False)
                refresh_button = gr.Button("🔄 Refresh Leaderboard")

        # Event handlers
        upload_button.click(
            fn=handle_upload,
            inputs=[submission_name, submission_file, report_file, model_framework],
            outputs=[status_box],
            show_progress="full",
        )
        refresh_button.click(
            fn=load_leaderboard_data,
            inputs=None,
            outputs=[leaderboard]
        )
        gr.Markdown(
            "### If you found our work useful, please consider citing our paper and dataset as follows:\n"
            "```bibtex\n"
            "@dataset{michailidis_2025_15592407,\n"
            "  author    = {Michailidis, Kostis and Tsouros, Dimosthenis and Guns, Tias},\n"
            "  title     = {CP-Bench},\n"
            "  month     = jun,\n"
            "  year      = 2025,\n"
            "  publisher = {Zenodo},\n"
            "  version   = {1.0.0},\n"
            "  doi       = {10.5281/zenodo.15592407},\n"
            "  url       = {https://doi.org/10.5281/zenodo.15592407},\n"
            "}\n"
            "```"
        )

    return demo
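

# Minimal usage sketch, assuming this module is also meant to be runnable
# directly; in the deployed Space the UI may instead be launched from a
# separate entry point (e.g. an app.py), which is an assumption here.
if __name__ == "__main__":
    demo = create_ui()
    demo.launch()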