# Source: CP-Bench-Leaderboard / src/hf_utils.py
# Last commit: "add minizinc" (e67d561) by kostis-init; file size 5.58 kB.
"""Utilities for interacting with the Hugging Face Hub."""
import io
import json
import os
import shutil
from pathlib import Path
import pandas as pd
from huggingface_hub import HfApi, hf_hub_download, list_repo_files
from src.config import DATASET_REPO_ID, DS_RESULTS_PATH, DS_SUBMISSIONS_PATH, LDB_COLS
# Build the shared Hub client once at import time. When construction fails,
# HF_API is left as None so callers can test it before touching the Hub.
try:
    HF_API = HfApi()
    print(f"Successfully initialized HfApi. Will use dataset repo: {DATASET_REPO_ID}")
except Exception as init_error:
    print(f"Failed to initialize HfApi: {init_error}")
    HF_API = None
def _parse_percentage(raw):
    """Convert text such as ``" 87.5%"`` to the float ``87.5``."""
    return float(raw.strip().replace("%", ""))


def _parse_count(raw):
    """Convert text such as ``" 42\\n"`` to the int ``42``."""
    return int(raw.strip())


# Marker text looked for in each summary line -> (index into LDB_COLS, parser
# applied to the text after the first ":").
_SUMMARY_FIELDS = (
    ("Execution perc", 1, _parse_percentage),
    ("Consistency perc", 2, _parse_percentage),
    ("Final Solution Accuracy", 3, _parse_percentage),
    ("Total Submitted Models Parsed", 4, _parse_count),
)


def _parse_summary_lines(lines):
    """Extract the leaderboard metrics from the lines of a summary file.

    Returns a dict mapping an index into ``LDB_COLS`` to the parsed value.
    Lines that contain a known marker but cannot be parsed (no ``:`` or a
    non-numeric value) are reported and skipped instead of aborting.
    """
    metrics = {}
    for line in lines:
        for marker, col_index, parse in _SUMMARY_FIELDS:
            if marker not in line:
                continue
            try:
                metrics[col_index] = parse(line.split(":")[1])
            except (IndexError, ValueError) as e:
                print(f"Skipping malformed summary line {line!r}: {e}")
    return metrics


def load_leaderboard_data():
    """Load leaderboard data from Hugging Face Dataset.

    Every directory under ``DS_SUBMISSIONS_PATH`` that contains a
    ``submission.jsonl`` yields one row. If a matching
    ``{DS_RESULTS_PATH}/<dir>/summary.txt`` exists in the repo, it is
    downloaded and its metrics fill the row; otherwise the metric columns
    keep the placeholder ``'In Progress...'``.

    Returns:
        A ``pandas.DataFrame`` with one row per submission directory, or an
        empty frame with columns ``LDB_COLS`` when the Hub client is
        unavailable or no entry could be loaded. Errors while talking to the
        Hub are printed, and the rows collected so far are still returned.
    """
    if not HF_API:
        return pd.DataFrame(columns=LDB_COLS)

    leaderboard_entries = []
    processed_result_dirs = set()
    try:
        # List all files of the dataset once; keep the list for ordering and
        # a set for constant-time "does the summary exist" checks.
        repo_files = HF_API.list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")
        known_files = set(repo_files)
        submissions_prefix = DS_SUBMISSIONS_PATH + "/"

        for file_path in repo_files:
            if not (file_path.endswith("submission.jsonl")
                    and file_path.startswith(submissions_prefix)):
                continue

            dir_name = Path(file_path).parent.name
            if dir_name in processed_result_dirs:
                continue
            processed_result_dirs.add(dir_name)

            entry = {
                LDB_COLS[0]: dir_name,
                LDB_COLS[1]: 'In Progress...',
                LDB_COLS[2]: 'In Progress...',
                LDB_COLS[3]: 'In Progress...',
                LDB_COLS[4]: 'In Progress...',
            }

            # No summary yet: the submission is listed with placeholders only.
            summary_repo_path = f"{DS_RESULTS_PATH}/{dir_name}/summary.txt"
            if summary_repo_path not in known_files:
                leaderboard_entries.append(entry)
                continue

            # Download the summary into a per-submission scratch directory.
            local_summary_path = hf_hub_download(
                repo_id=DATASET_REPO_ID,
                filename=summary_repo_path,
                repo_type="dataset",
                local_dir=os.path.join("temp_hf_downloads", dir_name),
            )
            if Path(local_summary_path).exists():
                with open(local_summary_path, "r", encoding="utf-8") as f:
                    metrics = _parse_summary_lines(f)
                for col_index, value in metrics.items():
                    entry[LDB_COLS[col_index]] = value
                os.remove(local_summary_path)

            leaderboard_entries.append(entry)
    except Exception as e:
        # Best effort: report the failure and return whatever was collected.
        print(f"Error loading leaderboard data: {e}")
    finally:
        # Clean up the scratch directory; ignore_errors also covers the case
        # where it was never created.
        shutil.rmtree("temp_hf_downloads", ignore_errors=True)

    if not leaderboard_entries:
        return pd.DataFrame(columns=LDB_COLS)
    return pd.DataFrame(leaderboard_entries)
def upload_submission(uploaded_file, dir_name, report_file, model_framework):
    """Store a submission in the Hugging Face dataset repository.

    Up to three files are uploaded, each in its own commit, under
    ``{DS_SUBMISSIONS_PATH}/{dir_name}``: the submission itself as
    ``submission.jsonl``, the optional report as ``report.pdf``, and a
    generated ``metadata.json`` holding the submission name and framework.

    Args:
        uploaded_file: Passed to ``HfApi.upload_file`` as ``path_or_fileobj``
            for the submission file.
        dir_name: Submission name; used as the directory name in the repo.
        report_file: Optional report passed the same way; skipped when falsy.
        model_framework: Value stored under ``"modelling_framework"`` in the
            metadata file.

    Returns:
        ``(True, submission_path)`` on success, otherwise ``(False, message)``
        where the message describes why nothing (or not everything) was
        uploaded.
    """
    if not HF_API:
        return False, "Hugging Face API not initialized"

    def _push(payload, filename, message):
        # One commit per file, all landing in the same submission directory.
        HF_API.upload_file(
            path_or_fileobj=payload,
            path_in_repo=f"{target_dir}/{filename}",
            repo_id=DATASET_REPO_ID,
            repo_type="dataset",
            commit_message=message,
        )

    try:
        target_dir = f"{DS_SUBMISSIONS_PATH}/{dir_name}"

        _push(uploaded_file, "submission.jsonl", f"Upload submission: {dir_name}")

        if report_file:
            _push(report_file, "report.pdf", f"Upload report for submission: {dir_name}")

        # The metadata file is generated in memory and uploaded last.
        manifest = {
            "submission_name": dir_name,
            "modelling_framework": model_framework,
        }
        manifest_bytes = json.dumps(manifest, indent=4).encode('utf-8')
        _push(
            io.BytesIO(manifest_bytes),
            "metadata.json",
            f"Upload metadata for submission: {dir_name}",
        )
    except Exception as err:
        return False, "Upload error: " + str(err)
    else:
        return True, target_dir
def check_name_exists(submission_name):
    """Return whether a submission called *submission_name* is already stored.

    A name counts as existing when the dataset repo holds a file at
    ``{DS_SUBMISSIONS_PATH}/{submission_name}`` or anywhere below that
    directory. Matching is done on the directory boundary, so ``"foo"`` is
    not reported as taken merely because ``"foobar"`` exists.

    Returns:
        ``True`` if the name is in use. ``False`` if it is free, if the Hub
        client is unavailable, or if listing the repo fails (the error is
        printed). An empty name is never reported as existing.
    """
    if not HF_API:
        return False

    target = f"{DS_SUBMISSIONS_PATH}/{submission_name}"
    # The trailing slash pins the match to a whole path component.
    target_dir = target + "/"
    try:
        repo_files = HF_API.list_repo_files(repo_id=DATASET_REPO_ID, repo_type="dataset")
        return any(
            file_path == target or file_path.startswith(target_dir)
            for file_path in repo_files
        )
    except Exception as e:
        print(f"Error checking name existence: {e}")
        return False