import gradio as gr
import pandas as pd
import os
import shutil
from pathlib import Path
import subprocess # For running eval.py
import time
import threading # For background tasks
import sys
# --- Configuration ---
SUBMISSIONS_DIR = "submissions"
RESULTS_DIR = "results"
EVAL_SCRIPT_PATH = "eval.py"
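
# Contract assumed for eval.py (inferred from how it is invoked and parsed below):
# it is run as `python eval.py <submission_dir> <results_dir>` and is expected to
# write a subdirectory inside <results_dir> containing a summary.txt whose
# "Score: <value>" line is what the leaderboard displays. Everything else eval.py
# does is up to that script; this app only launches it and reads its results.
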
# --- Helper Functions ---
def setup_directories():
    """Creates the submissions and results directories if they don't exist."""
    os.makedirs(SUBMISSIONS_DIR, exist_ok=True)
    os.makedirs(RESULTS_DIR, exist_ok=True)
    if not os.listdir(RESULTS_DIR):  # Add a placeholder if results is empty
        initial_result_demo_path = Path(RESULTS_DIR) / "initial_example_result"
        if not initial_result_demo_path.exists():
            os.makedirs(initial_result_demo_path, exist_ok=True)
            with open(initial_result_demo_path / "summary.txt", "w") as f:
                f.write("This is a placeholder initial result.\nScore: 0\n")
            print(f"Created a sample directory in '{RESULTS_DIR}' for demonstration.")
def load_leaderboard_data():
    """
    Scans the RESULTS_DIR for subdirectories and returns a DataFrame.
    Each subdirectory name is an entry. Tries to parse a 'Score' from 'summary.txt'.
    """
    if not os.path.exists(RESULTS_DIR):
        return pd.DataFrame(columns=["Result Directory", "Score", "Files"])

    result_dirs = [d for d in os.listdir(RESULTS_DIR) if os.path.isdir(Path(RESULTS_DIR) / d)]
    leaderboard_entries = []

    # Sort by modification time of the directory (newest first);
    # this requires getting the mtime for each directory.
    sorted_result_dirs = sorted(
        result_dirs,
        key=lambda d: (Path(RESULTS_DIR) / d).stat().st_mtime,
        reverse=True
    )

    for dir_name in sorted_result_dirs:
        entry = {"Result Directory": dir_name, "Score": "N/A", "Files": 0}
        result_dir_path = Path(RESULTS_DIR) / dir_name
        try:
            entry["Files"] = len([f for f in os.listdir(result_dir_path) if os.path.isfile(result_dir_path / f)])
        except Exception:
            pass  # Directory might have been removed during scan

        summary_file = result_dir_path / "summary.txt"
        if summary_file.exists():
            try:
                with open(summary_file, "r") as f:
                    for line in f:
                        if line.lower().startswith("score:"):
                            entry["Score"] = line.split(":", 1)[1].strip()
                            break
            except Exception as e:
                print(f"Error parsing summary for {dir_name}: {e}")
        leaderboard_entries.append(entry)

    if not leaderboard_entries:
        return pd.DataFrame(columns=["Result Directory", "Score", "Files"])
    return pd.DataFrame(leaderboard_entries)
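
# Illustrative results layout that load_leaderboard_data() can parse (every name
# except summary.txt is a hypothetical example, not something this app creates):
#
#   results/
#       my_submission_20240101-120000/
#           summary.txt        <- the first line starting with "Score:" feeds the Score column
#           predictions.csv    <- additional files only affect the "Files" count
#
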
def run_evaluation_in_background(submission_dir_path_str: str, results_dir_str: str, submission_name_for_log: str):
    """
    Runs eval.py in a subprocess. Intended to be executed in a separate thread.
    Output from eval.py goes to the console where app.py is running.
    """
    print(
        f"BACKGROUND THREAD: Starting evaluation for '{submission_name_for_log}' using path '{submission_dir_path_str}'...")

    if not Path(EVAL_SCRIPT_PATH).exists():
        print(
            f"BACKGROUND THREAD: CRITICAL ERROR - Evaluation script '{EVAL_SCRIPT_PATH}' not found. Eval aborted for '{submission_name_for_log}'.")
        return

    command = [sys.executable, EVAL_SCRIPT_PATH, submission_dir_path_str, results_dir_str]
    try:
        # subprocess.run is the simpler choice for a blocking call inside this worker thread.
        process = subprocess.run(
            command,
            capture_output=True,
            text=True,
            check=False,  # Handle non-zero exit codes manually
            timeout=300   # 5-minute timeout for the evaluation script
        )
        eval_output = process.stdout.strip()
        eval_error = process.stderr.strip()

        print(
            f"--- BACKGROUND Eval STDOUT ({submission_name_for_log}) ---\n{eval_output if eval_output else '<No stdout>'}")
        if eval_error:  # Only print stderr if it's not empty
            print(f"--- BACKGROUND Eval STDERR ({submission_name_for_log}) ---\n{eval_error}")

        if process.returncode == 0:
            print(f"BACKGROUND THREAD: Evaluation successful for '{submission_name_for_log}'.")
        else:
            print(
                f"BACKGROUND THREAD: Evaluation FAILED for '{submission_name_for_log}'. Script exit code: {process.returncode}")
    except subprocess.TimeoutExpired:
        print(f"BACKGROUND THREAD: Evaluation for '{submission_name_for_log}' TIMED OUT after 5 minutes.")
    except FileNotFoundError:
        # Raised when the interpreter (sys.executable) or EVAL_SCRIPT_PATH cannot be found.
        print(
            f"BACKGROUND THREAD: FileNotFoundError - Could not execute command. Ensure the Python interpreter is available and '{EVAL_SCRIPT_PATH}' is correct for '{submission_name_for_log}'.")
    except Exception as e:
        print(
            f"BACKGROUND THREAD: An unexpected error occurred during evaluation for '{submission_name_for_log}': {str(e)}")

    print(f"BACKGROUND THREAD: Finished evaluation attempt for '{submission_name_for_log}'.")
def handle_upload_and_kickoff_eval(uploaded_files_list, progress=gr.Progress(track_tqdm=True)):
    """
    Handles a directory upload, saves its files, and starts eval.py in a background thread.
    Yields status messages for the UI. The leaderboard updates separately.
    """
    yield "Processing upload..."  # Initial status

    if not uploaded_files_list:
        yield "No directory uploaded. Please select a directory."
        return

    try:
        # Determine the original uploaded directory name from the first temp file's parent.
        first_temp_file_path = Path(uploaded_files_list[0].name)
        original_uploaded_dir_name = first_temp_file_path.parent.name
        submission_dir_path = Path(SUBMISSIONS_DIR) / original_uploaded_dir_name

        # Handle a potential name collision with an existing submission.
        if submission_dir_path.exists():
            timestamp = time.strftime("%Y%m%d-%H%M%S")
            descriptive_name_for_log_and_status = f"{original_uploaded_dir_name}_{timestamp}"
            submission_dir_path = Path(SUBMISSIONS_DIR) / descriptive_name_for_log_and_status
            status_update_msg = f"Directory '{original_uploaded_dir_name}' existed. Saving as '{descriptive_name_for_log_and_status}'."
            original_uploaded_dir_name = descriptive_name_for_log_and_status  # Use the new name for logging
        else:
            descriptive_name_for_log_and_status = original_uploaded_dir_name
            status_update_msg = f"Copying files for '{descriptive_name_for_log_and_status}'..."

        os.makedirs(submission_dir_path, exist_ok=True)
        progress(0.1, desc=status_update_msg)

        for temp_file_obj in progress.tqdm(uploaded_files_list, desc="Copying files"):
            temp_file_path = Path(temp_file_obj.name)
            file_name_in_dir = temp_file_path.name
            target_file_path = submission_dir_path / file_name_in_dir
            shutil.copy(str(temp_file_path), str(target_file_path))

        upload_completion_msg = f"Upload of '{descriptive_name_for_log_and_status}' complete."
        progress(0.8, desc=upload_completion_msg)
    except Exception as e:
        yield f"Error during upload: {str(e)}"
        return

    # --- Start evaluation in a background thread ---
    if not Path(EVAL_SCRIPT_PATH).exists():
        yield f"{upload_completion_msg} BUT CRITICAL ERROR: Evaluation script '{EVAL_SCRIPT_PATH}' not found. Evaluation cannot be started."
        return

    # Pass absolute paths as plain strings to the background thread.
    abs_submission_path = str(submission_dir_path.resolve())
    abs_results_path = str(Path(RESULTS_DIR).resolve())

    eval_thread = threading.Thread(
        target=run_evaluation_in_background,
        args=(abs_submission_path, abs_results_path, descriptive_name_for_log_and_status),
        daemon=True  # Daemon thread: it won't keep the app alive on exit
    )
    eval_thread.start()

    final_status_msg = (
        f"{upload_completion_msg} Evaluation for '{descriptive_name_for_log_and_status}' has started in the background. "
        "Refresh the leaderboard to see the result once it finishes."
    )
    progress(1.0, desc="Background evaluation initiated.")
    yield final_status_msg
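
# Because handle_upload_and_kickoff_eval is a generator, Gradio streams each yielded
# string into the status textbox, so the user sees "Processing upload..." and then
# the final message without the UI blocking on the evaluation itself.
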
# --- Create Directories ---
setup_directories()
# --- Gradio App Definition ---
with gr.Blocks(title="Background Submission, Evaluation, and Leaderboard") as demo:
    gr.Markdown("# Background Submission, Evaluation & Results")
    gr.Markdown(
        f"Upload submissions (directories) to **'{SUBMISSIONS_DIR}'**. "
        f"The evaluation script (`{EVAL_SCRIPT_PATH}`) will process them in the background. "
        f"Results appear in **'{RESULTS_DIR}'**. Refresh the leaderboard to see new results."
    )

    with gr.Row():
        with gr.Column(scale=1):  # Upload column
            gr.Markdown("## 📤 Upload & Evaluate Submission")
            upload_button = gr.UploadButton(
                "Click to Upload Directory for Evaluation",
                file_count="directory",
            )
            upload_status_textbox = gr.Textbox(label="Current Status", interactive=False, lines=4)

        with gr.Column(scale=2):  # Leaderboard column
            gr.Markdown("## 🏆 Results Leaderboard")
            leaderboard_df_component = gr.DataFrame(
                value=load_leaderboard_data,  # Load initial data
                label="Leaderboard (refresh to see new results)",
                interactive=False,
                # every=20  # Uncomment to auto-refresh the leaderboard every 20 seconds
            )
            refresh_leaderboard_button = gr.Button("🔄 Refresh Leaderboard Manually")

    # --- Event Handlers ---
    upload_button.upload(
        fn=handle_upload_and_kickoff_eval,
        inputs=[upload_button],
        outputs=[upload_status_textbox],  # Single output: the status message
        show_progress="full"
    )
    refresh_leaderboard_button.click(
        fn=load_leaderboard_data,
        inputs=None,
        outputs=[leaderboard_df_component]
    )

if __name__ == "__main__":
    demo.queue().launch()
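
# ---------------------------------------------------------------------------
# eval.py is not part of this file. A minimal sketch that would satisfy the
# contract used above (invoked as `python eval.py <submission_dir> <results_dir>`,
# writes a result subdirectory containing a summary.txt with a "Score: ..." line)
# might look like the commented example below. The scoring logic (counting the
# submitted files) is purely illustrative and not the real metric.
#
#   import sys
#   import time
#   from pathlib import Path
#
#   def main():
#       submission_dir = Path(sys.argv[1])
#       results_dir = Path(sys.argv[2])
#       out_dir = results_dir / f"{submission_dir.name}_{time.strftime('%Y%m%d-%H%M%S')}"
#       out_dir.mkdir(parents=True, exist_ok=True)
#       score = len(list(submission_dir.iterdir()))  # placeholder "score"
#       (out_dir / "summary.txt").write_text(
#           f"Submission: {submission_dir.name}\nScore: {score}\n"
#       )
#       print(f"Wrote results to {out_dir}")
#
#   if __name__ == "__main__":
#       main()
# ---------------------------------------------------------------------------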